From a4026570aaaf0ce8d1964488b14ea4028104df8d Mon Sep 17 00:00:00 2001 From: powturbo Date: Mon, 5 Jan 2015 23:13:49 +0100 Subject: [PATCH] New functions + Integer Lists Intersection --- README.md | 78 +- bitpack.c | 6 +- bitpack.h | 10 +- bitpack64_.h | 4 +- bitpack_.h | 92 +- bitunpack.c | 47 +- bitunpack.h | 67 +- bitunpack64_.h | 60 +- bitunpack_.h | 13 +- conf.h | 34 +- ext/OPT_PFD/main.cpp | 101 + ext/OPT_PFD/opt_p4.h | 54 + ext/OPT_PFD/pf.h | 158 + ext/OPT_PFD/s16head.h | 251 + ext/OPT_PFD/unpack.h | 773 + ext/simdcomp/bitpacka.c | 17773 +++++++++++ ext/simdcomp/bitpacka.h | 28 + ext/simdcomp/example.c | 66 + ext/simdcomp/include/simdbitpacking.h | 21 + ext/simdcomp/include/simdcomp.h | 12 + ext/simdcomp/include/simdcomputil.h | 29 + .../include/simdintegratedbitpacking.h | 27 + ext/simdcomp/makefile | 54 + ext/simdcomp/src/simdbitpacking.c | 14009 +++++++++ ext/simdcomp/src/simdcomputil.c | 56 + ext/simdcomp/src/simdintegratedbitpacking.c | 24872 ++++++++++++++++ ext/simdcomp/src/unit.c | 63 + ext/simple8b.c | 330 + ext/simple8b.h | 2 + ext/vabyte.h | 99 + ext/varintg8iu.c | 182 + ext/varintg8iu.h | 5 + ext/vas16c.h | 36 + ext/vas16d.h | 403 + ext/vbyte_poly.h | 46 + icbench.c | 907 +- idx.h | 20 + idxcr.c | 153 + idxqry.c | 364 + makefile | 43 +- vint.h | 52 +- vp4dc.c | 10 +- vp4dc.h | 17 +- vp4dc_.h | 37 +- vp4dd.c | 13 +- vp4dd.h | 55 +- vp4dd_.h | 121 +- vsimple.c | 9 +- vsimple.h | 24 +- vsimple_.h | 85 +- 50 files changed, 60996 insertions(+), 775 deletions(-) create mode 100644 ext/OPT_PFD/main.cpp create mode 100644 ext/OPT_PFD/opt_p4.h create mode 100644 ext/OPT_PFD/pf.h create mode 100644 ext/OPT_PFD/s16head.h create mode 100644 ext/OPT_PFD/unpack.h create mode 100644 ext/simdcomp/bitpacka.c create mode 100644 ext/simdcomp/bitpacka.h create mode 100644 ext/simdcomp/example.c create mode 100644 ext/simdcomp/include/simdbitpacking.h create mode 100644 ext/simdcomp/include/simdcomp.h create mode 100644 ext/simdcomp/include/simdcomputil.h create mode 
100644 ext/simdcomp/include/simdintegratedbitpacking.h create mode 100644 ext/simdcomp/makefile create mode 100644 ext/simdcomp/src/simdbitpacking.c create mode 100644 ext/simdcomp/src/simdcomputil.c create mode 100644 ext/simdcomp/src/simdintegratedbitpacking.c create mode 100644 ext/simdcomp/src/unit.c create mode 100644 ext/simple8b.c create mode 100644 ext/simple8b.h create mode 100644 ext/vabyte.h create mode 100644 ext/varintg8iu.c create mode 100644 ext/varintg8iu.h create mode 100644 ext/vas16c.h create mode 100644 ext/vas16d.h create mode 100644 ext/vbyte_poly.h create mode 100644 idx.h create mode 100644 idxcr.c create mode 100644 idxqry.c diff --git a/README.md b/README.md index 5fbe05e..8192776 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,30 @@ TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/powturbo/TurboPFor.svg?branch=master)](https://travis-ci.org/powturbo/TurboPFor) ====================================== -- 100% C, without inline assembly +- 100% C/C++, without inline assembly

- Fastest **"Variable Byte"** implementation

- Novel **"Variable Simple"** faster than simple16 and more compact than simple64

-- Scalar **"Binary Packing"** with bulk decoding as fast as SIMD FastPFor in realistic (No "pure cache") scenarios -- Binary Packing with **Direct/Random Access** without decompressing entire blocks -- Access any single binary packed entry with **zero decompression** +- Scalar **"Bit Packing"** with bulk decoding as fast as SIMD FastPFor in realistic and practical (No "pure cache") scenarios +- Bit Packing with **Direct/Random Access** without decompressing entire blocks +- Access any single bit packed entry with **zero decompression** +- Reducing **Cache Pollution**

-- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding +- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding. + Outstanding compression

- Several times faster than other libraries -- Usage as easy as memcpy -- Instant access to compressed *frequency* and *position* data in inverted index with zero decoding - +- Usage in C/C++ as easy as memcpy +- Most functions optimized for speed and others for high compression ratio +- **New:** Include more functions +

+- Instant access to compressed *frequency* and *position* data in inverted index with zero decompression +- **New:** Inverted Index Demo + Benchmarks: Intersection of lists of sorted integers. +- more than **1000 queries per second** on gov2 (25 millions documents) on a **SINGLE** core. +- Decompress only the minimum necessary blocks. + # Benchmark: i7-2600k at 3.4GHz, gcc 4.9, ubuntu 14.10. - Single thread @@ -47,23 +55,63 @@ coming soon! ## Compile: make -## Usage +## Benchmark ###### Synthetic data: - 1. test all functions
+ 1. test all functions ./icbench -a1.0 -m0 -x8 -n100000000 - zipfian distribution alpha = 1.0 (Ex. -a1.0=uniform -a1.5=skewed distribution) - number of integers = 100000000 - integer range from 0 to 255 (integer size = 0 to 8 bits) - 2. individual function test (ex. copy TurboPack TurboPack Direct access)
- ./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopack,da -n100000000 + 2. individual function test (ex. copy TurboPack TurboPack Direct access) + ./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopackda -n100000000 ###### Data files: - - Data file Benchmark (file format as in FastPFOR)
- ./icbench -n10000000000 clueweb09.sorted + - Data file Benchmark (file format as in FastPFOR) + ./icbench gov2.sorted + +###### Benchmarking intersections + - Download "gov2.sorted" (or clueweb09) + query file "aol.txt" + from "http://lemire.me/data/integercompression2014.html" + + - Create index file gov2.sorted.i + ./idxcr gov2.sorted . + create inverted index file "gov2.sorted.i" in the current directory + + - Benchmarking intersections + ./idxqry gov2.sorted.i aol.txt + run queries in file "aol.txt" over the index of gov2 file + + 8GB Minimum of RAM required (16GB recommended for benchmarking "clueweb09" files). + + +## Function usage: +In general compression/decompression functions are of the form: + + char *endptr = compress( unsigned *in, int n, char *out) + endptr : set by compress to the next character in "out" after the compressed buffer + in : input integer array + n : number of elements + out : pointer to output buffer + + char *endptr = decompress( char *in, int n, unsigned *out) + endptr : set by decompress to the next character in "in" after the decompressed buffer + in : pointer to input buffer + n : number of elements + out : output integer array + +header files with documentation : + vint.h - variable byte + vsimple.h - variable simple + vp4dc.h,vp4dd.h - TurboPFor + bitpack.h,bitunpack.h - Bit Packing + ## Reference: - - "SIMD-BitPack FPF" from FastPFor https://github.com/lemire/simdcomp + - "SIMD-BitPack FPF" from FastPFor https://github.com/lemire/simdcomp + - Sorted integer datasets from http://lemire.me/data/integercompression2014.html - OptP4 and Simple-16 from http://jinruhe.com/ +#---------------------------------------------------------------------------------- + diff --git a/bitpack.c b/bitpack.c index e364984..33a8dfb 100644 --- a/bitpack.c +++ b/bitpack.c @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -29,6 +29,6 @@ #define PAD8(__x) ( (((__x)+8-1)/8) ) -unsigned char *bitpack32(unsigned *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } -unsigned char *bitpack16(unsigned short *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } +unsigned char *bitpack32(unsigned *__restrict in, int n, int nb, unsigned char *__restrict out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } +unsigned char *bitpack16(unsigned short *__restrict in, int n, int nb, unsigned char *__restrict out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } diff --git a/bitpack.h b/bitpack.h index 77dee67..86a1431 100644 --- a/bitpack.h +++ b/bitpack.h @@ -16,15 +16,17 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitpack.c - "Integer Compression" binary packing + bitpack.c - "Integer Compression" Binary Packing **/ -unsigned char *bitpack16( unsigned short *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out); -unsigned char *bitpack32( unsigned *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out); +// Pack array with n unsigned (32 bits in[n]) values to the buffer out using nbits per value. 
Return value = end of compressed buffer out +unsigned char *bitpack32( unsigned *__restrict in, int n, int nbits, unsigned char *__restrict out); +// like bitpack32 but for 16 bits arrays +unsigned char *bitpack16( unsigned short *__restrict in, int n, int nbits, unsigned char *__restrict out); diff --git a/bitpack64_.h b/bitpack64_.h index d74b27c..4ce1e8d 100644 --- a/bitpack64_.h +++ b/bitpack64_.h @@ -16,12 +16,12 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitpack64_.h - "Integer Compression" binary packing + bitpack64_.h - "Integer Compression" bit packing include file **/ #define BITBLK32_1(ip, i, op, parm) { ; register uint32_t w;;\ diff --git a/bitpack_.h b/bitpack_.h index 3fcabd2..2ca534e 100644 --- a/bitpack_.h +++ b/bitpack_.h @@ -16,14 +16,13 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitpack_.h - "Integer Compression" binary packing + bitpack_.h - "Integer Compression" bit packing **/ - #include #define USE_BITPACK 64 #if 0 @@ -77,8 +76,8 @@ case 32:do BITPACK64_32(__ip, __op, __parm) while(__ip < __ipe);\ }\ } while(0) - #elif USE_BITPACK == 32 -#include "bitpack32_.h" + #else +#include "bitpack32_.h" // Not included in the github package #define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\ switch(__nbits) {\ case 0:__ip = __ipe; break;\ @@ -114,87 +113,6 @@ case 30:do BITPACK32_30(__ip, __op, __parm) while(__ip < __ipe); break;\ case 31:do BITPACK32_31(__ip, __op, __parm) while(__ip < __ipe); break;\ case 32:do BITPACK32_32(__ip, __op, __parm) while(__ip < __ipe);\ - } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\ -} while(0) - #else - #if 1 -#define SRCI(__ip) __ip+=32 -#define SRC(__ip,__x) __ip[__x] -#define SRCP( __ip) - #else -#define SRCI(__ip) -#define SRC( __ip,__x) (*__ip++) -//#define SRCP( __ip) (__ip++) - #endif -#include "pack/bitpack32_1.h" -#include "pack/bitpack32_2.h" -#include "pack/bitpack32_3.h" -#include "pack/bitpack32_4.h" -#include "pack/bitpack32_5.h" -#include "pack/bitpack32_6.h" -#include "pack/bitpack32_7.h" -#include "pack/bitpack32_8.h" -#include "pack/bitpack32_9.h" -#include "pack/bitpack32_10.h" -#include "pack/bitpack32_11.h" -#include "pack/bitpack32_12.h" -#include "pack/bitpack32_13.h" -#include "pack/bitpack32_14.h" -#include "pack/bitpack32_15.h" -#include "pack/bitpack32_16.h" -#include "pack/bitpack32_17.h" -#include "pack/bitpack32_18.h" -#include "pack/bitpack32_19.h" -#include "pack/bitpack32_20.h" -#include "pack/bitpack32_21.h" -#include "pack/bitpack32_22.h" -#include "pack/bitpack32_23.h" -#include 
"pack/bitpack32_24.h" -#include "pack/bitpack32_25.h" -#include "pack/bitpack32_26.h" -#include "pack/bitpack32_27.h" -#include "pack/bitpack32_28.h" -#include "pack/bitpack32_29.h" -#include "pack/bitpack32_30.h" -#include "pack/bitpack32_31.h" -#include "pack/bitpack32_32.h" -#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\ - switch(__nbits) {\ - case 0:__ip = __ipe; break;\ - case 1:do BITPACK_1( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 2:do BITPACK_2( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 3:do BITPACK_3( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 4:do BITPACK_4( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 5:do BITPACK_5( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 6:do BITPACK_6( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 7:do BITPACK_7( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 8:do BITPACK_8( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 9:do BITPACK_9( __ip, __op, __parm) while(__ip < __ipe); break;\ - case 10:do BITPACK_10(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 11:do BITPACK_11(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 12:do BITPACK_12(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 13:do BITPACK_13(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 14:do BITPACK_14(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 15:do BITPACK_15(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 16:do BITPACK_16(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 17:do BITPACK_17(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 18:do BITPACK_18(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 19:do BITPACK_19(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 20:do BITPACK_20(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 21:do BITPACK_21(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 22:do 
BITPACK_22(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 23:do BITPACK_23(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 24:do BITPACK_24(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 25:do BITPACK_25(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 26:do BITPACK_26(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 27:do BITPACK_27(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 28:do BITPACK_28(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 29:do BITPACK_29(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 30:do BITPACK_30(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 31:do BITPACK_31(__ip, __op, __parm) while(__ip < __ipe); break;\ - case 32:do BITPACK_32(__ip, __op, __parm) while(__ip < __ipe);\ - } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\ + }\ } while(0) #endif -// - diff --git a/bitunpack.c b/bitunpack.c index 830ad4b..14550ad 100644 --- a/bitunpack.c +++ b/bitunpack.c @@ -16,41 +16,50 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitunpack_.h - "Integer Compression" binary packing + bitunpack_.h - "Integer Compression" Binary Packing **/ - +#include "conf.h" #include "bitunpack.h" - #define PAD8(__x) (((__x)+7)/8) -unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i; for(i=0; i < n; i++ ) out[i] = bitgetx32(in, b, i); return in + PAD8(n*b); } -unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); } -#define BPI(__w,__parm) __w +//----------------------------------------------------------------------------------------------------------------- +#define BPI(__w, __op, __parm) __w #include "bitunpack_.h" -unsigned char *bitunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; } -unsigned char *bitunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned short *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; } +unsigned char *bitunpack32( unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; } +unsigned char *bitunpack16( unsigned char *__restrict in, unsigned n, unsigned b, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; } +#undef BPI + +//----------------------------------------------------------------------------------------------------------------- +#define BPI(__w, __op, __parm) (__parm += (__w) + 1) +#include 
"bitunpack_.h" +unsigned char *bitdunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitdunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } #undef BPI //------------------------------------------------------------------------------------------ -#define BPI(__w,__parm) (__parm += (__w) + 1) +#define BPI(__w, __op, __parm) (__parm += (__w)) #include "bitunpack_.h" - -unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; } -unsigned char *bitdunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); } -unsigned char *bitdunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; } +unsigned char *bitd0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitd0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } #undef BPI //------------------------------------------------------------------------------------------ -#define BPI(__w,__parm) (__parm + (__w) + 1) +#define BPI(__w, __op, __parm) (__parm + (__op+1-_op))//#define BPI(__w, __op, __parm) (__parm + (__w) + 1) #include 
"bitunpack_.h" - -unsigned char *bitfunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; } -unsigned char *bitfunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+1; return in + PAD8(n*b); } -unsigned char *bitfunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; } +unsigned char *bitfunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitfunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +#undef BPI + +//------------------------------------------------------------------------------------------ +#define BPI(__w, __op, __parm) (__parm + (__op-_op)) +#include "bitunpack_.h" + +unsigned char *bitf0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } +unsigned char *bitf0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; } #undef BPI diff --git a/bitunpack.h b/bitunpack.h index ff1054c..bc572ff 100644 --- a/bitunpack.h +++ b/bitunpack.h @@ -16,36 +16,63 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitunpack.h - "Integer Compression" binary packing + bitunpack.h - "Integer Compression" Binary Packing **/ +#ifdef __cplusplus +extern "C" { +#endif -// BP -static inline unsigned bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<>5))) >> (bidx&0x1f)) & ((1ull<>4))) >> (bidx& 0xf)) & ((1 <>4))) >> (bidx& 0xf)) & ((1 < + #else +#define _bzhi_u64(__u, __b) ((__u) & ((1ull<<__b)-1)) +#define _bzhi_u32(__u, __b) ((__u) & ((1u <<__b)-1)) + #endif -unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out); -unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out); +// Get a single 32 bits value with index "idx" (or bit index b*idx) from packed integer array +static ALWAYS_INLINE unsigned bitgetx32(unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); } +static ALWAYS_INLINE unsigned _bitgetx32(unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); } + +// like bitgetx32 but for 16 bits integer array +static ALWAYS_INLINE unsigned bitgetx16(unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); } +static ALWAYS_INLINE unsigned _bitgetx16(unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); } -// DFOR -unsigned char *bitdunpack16( unsigned char 
*__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out); -unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out); -unsigned char *bitdunpackb32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out); +// Set a single value with index "idx" +static ALWAYS_INLINE void bitsetx32(unsigned char *__restrict in, unsigned b, unsigned idx, unsigned v) { unsigned bidx = b*idx; unsigned long long *p = (unsigned long long *)((unsigned *)in+(bidx>>5)); *p = ( *p & ~(((1ull<>4) ; *p = ( *p & ~(((1u <= val) { *oidx=idx; return oval; } } return INT_MAX; } +// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i] +unsigned char *bitd0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out); +unsigned char *bitd0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out); + +// ---------------- DaFor : Direct Access for packed SORTED array (Ex. 
DocId in inverted index) -------------------------------------------- +// out[i] = start + in[i] + i + 1 +unsigned char *bitfunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out); +unsigned char *bitfunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out); + +// out[i] = start + in[i] + i +unsigned char *bitf0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out); +unsigned char *bitf0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out); + +#ifdef __cplusplus +} +#endif diff --git a/bitunpack64_.h b/bitunpack64_.h index 88ac332..8ad57b9 100644 --- a/bitunpack64_.h +++ b/bitunpack64_.h @@ -16,12 +16,12 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - bitunpack64_.c - "Integer Compression" binary packing + bitunpack64_.c - "Integer Compression" scalar bit packing **/ #define BITUNBLK32_0(ip, i, op, parm) { \ @@ -141,7 +141,7 @@ BITUNBLK64_2(ip, 0, op, parm); DSTI(op); ip += 2*4/sizeof(ip[0]);\ } -#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ +#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ DST(op,i*64+ 0, (w0 ) & 0x7, parm);\ DST(op,i*64+ 1, (w0 >> 3) & 0x7, parm);\ DST(op,i*64+ 2, (w0 >> 6) & 0x7, parm);\ @@ -162,7 +162,7 @@ DST(op,i*64+17, (w0 >> 51) & 0x7, parm);\ DST(op,i*64+18, (w0 >> 54) & 0x7, parm);\ DST(op,i*64+19, (w0 >> 57) & 0x7, parm);\ - DST(op,i*64+20, (w0 >> 60) & 0x7, parm); \ + 
DST(op,i*64+20, (w0 >> 60) & 0x7, parm); register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ \ DST(op,i*64+21, (w0 >> 63) | (w1 << 1) & 0x7, parm);\ DST(op,i*64+22, (w1 >> 2) & 0x7, parm);\ @@ -181,28 +181,28 @@ BITUNBLK64_3(ip, 0, op, parm); DSTI(op); ip += 3*4/sizeof(ip[0]);\ } -#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip/*+(i*1+0)*8/sizeof(ip[0])*/);ip += 8/sizeof(ip[0]);\ - DST(op,i*16+ 0, (w0 ) & 0xf, parm);\ - DST(op,i*16+ 1, (w0 >> 4) & 0xf, parm);\ - DST(op,i*16+ 2, (w0 >> 8) & 0xf, parm);\ - DST(op,i*16+ 3, (w0 >> 12) & 0xf, parm);\ - DST(op,i*16+ 4, (w0 >> 16) & 0xf, parm);\ - DST(op,i*16+ 5, (w0 >> 20) & 0xf, parm);\ - DST(op,i*16+ 6, (w0 >> 24) & 0xf, parm);\ - DST(op,i*16+ 7, (w0 >> 28) & 0xf, parm);\ - DST(op,i*16+ 8, (w0 >> 32) & 0xf, parm);\ - DST(op,i*16+ 9, (w0 >> 36) & 0xf, parm);\ - DST(op,i*16+10, (w0 >> 40) & 0xf, parm);\ - DST(op,i*16+11, (w0 >> 44) & 0xf, parm);\ - DST(op,i*16+12, (w0 >> 48) & 0xf, parm);\ - DST(op,i*16+13, (w0 >> 52) & 0xf, parm);\ - DST(op,i*16+14, (w0 >> 56) & 0xf, parm);\ - DST(op,i*16+15, (w0 >> 60), parm);;\ +#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ + DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 
>>= 8;\ + DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\ + DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\ } #define BITUNPACK64_4(ip, op, parm) { \ BITUNBLK64_4(ip, 0, op, parm);\ - BITUNBLK64_4(ip, 1, op, parm); DSTI(op); /*ip += 4*4/sizeof(ip[0]);*/\ + BITUNBLK64_4(ip, 1, op, parm); DSTI(op); ip += 4*4/sizeof(ip[0]);\ } #define BITUNBLK64_5(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ @@ -330,14 +330,14 @@ } #define BITUNBLK64_8(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ - DST(op,i*8+ 0, (w0 ) & 0xff, parm);\ - DST(op,i*8+ 1, (w0 >> 8) & 0xff, parm);\ - DST(op,i*8+ 2, (w0 >> 16) & 0xff, parm);\ - DST(op,i*8+ 3, (w0 >> 24) & 0xff, parm);\ - DST(op,i*8+ 4, (w0 >> 32) & 0xff, parm);\ - DST(op,i*8+ 5, (w0 >> 40) & 0xff, parm);\ - DST(op,i*8+ 6, (w0 >> 48) & 0xff, parm);\ - DST(op,i*8+ 7, (w0 >> 56) , parm);;\ + DST(op,i*8+ 0, (unsigned char)(w0 ), parm);\ + DST(op,i*8+ 1, (unsigned char)(w0 >> 8), parm);\ + DST(op,i*8+ 2, (unsigned char)(w0 >> 16), parm);\ + DST(op,i*8+ 3, (unsigned char)(w0 >> 24), parm);\ + DST(op,i*8+ 4, (unsigned char)(w0 >> 32), parm);\ + DST(op,i*8+ 5, (unsigned char)(w0 >> 40), parm);\ + DST(op,i*8+ 6, (unsigned char)(w0 >> 48), parm);\ + DST(op,i*8+ 7, (unsigned char)(w0 >> 56), parm);;\ } #define BITUNPACK64_8(ip, op, parm) { \ diff --git a/bitunpack_.h b/bitunpack_.h index 172e3d4..5c22dcb 100644 --- a/bitunpack_.h +++ b/bitunpack_.h @@ -16,23 +16,22 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo bitunpack_.h - "Integer Compression" binary packing **/ - #include -#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w,__parm) //__op[__x] = BPI(__w,__parm) // +#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w, __op, __parm) //__op[__x] = BPI(__w,__parm) // #define DSTI(__op) //__op += 32 // #define USE_BITUNPACK 64 #if USE_BITUNPACK == 64 #include "bitunpack64_.h" -#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\ +#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n,*_op=__op;\ switch(__nbits) {\ case 0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\ case 1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\ @@ -70,8 +69,8 @@ }\ } #elif USE_BITUNPACK == 32 -#include "bitunpack32_.h" -#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\ +#include "bitunpack32_.h" // Not included in the github package +#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;\ switch(__nbits) {\ case 0: do BITUNPACK32_0( __ip, __op, __parm) while(__op<__ope); break;\ case 1: do BITUNPACK32_1( __ip, __op, __parm) while(__op<__ope); break;\ @@ -106,7 +105,7 @@ case 30: do BITUNPACK32_30(__ip, __op, __parm) while(__op<__ope); break;\ case 31: do BITUNPACK32_31(__ip, __op, __parm) while(__op<__ope); break;\ case 32: do BITUNPACK32_32(__ip, __op, __parm) while(__op<__ope); break;\ - } /*printf("n=%d,%d,%d ", __n, __op, __parm - sd, __op, __parme - __op);*/\ + }\ } #endif diff --git a/conf.h b/conf.h index 2383ad1..185fbdf 100644 --- a/conf.h +++ b/conf.h @@ -16,19 +16,20 @@ with this program; if not, write to the Free Software Foundation, 
Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo conf.h - "Integer Compression" config & common **/ - +#ifndef CONF_H +#define CONF_H #if defined(__GNUC__) #define ALIGNED(t,v,n) __attribute__ ((aligned (n))) t v -#define ALWAYS_INLINE __attribute__((always_inline)) -#define _PACKED __attribute__ ((packed)) +#define ALWAYS_INLINE inline __attribute__((always_inline)) +#define _PACKED __attribute__ ((packed)) #define likely(x) __builtin_expect((x),1) #define unlikely(x) __builtin_expect((x),0) @@ -48,6 +49,11 @@ static inline int bsr32(int x) { return b + 1; } +static inline int __bsr32(int x) { + asm("bsr %1,%0" : "=r" (x) : "rm" (x) ); + return x; +} + static inline int bsr64(unsigned long long x) { return x?64 - __builtin_clzll(x):0; } @@ -66,5 +72,25 @@ static inline int bsr64(unsigned long long x) { #else #error "only gcc support in this version" #endif +//--------------------------------------------------------------------------------------------------- +#define ctou8(__cp) (*(unsigned char *)(__cp)) +#define ctou16(__cp) (*(unsigned short *)(__cp)) +#define ctou24(__cp) ((*(unsigned *)(__cp)) & 0xffffff) +#define ctou32(__cp) (*(unsigned *)(__cp)) +#define ctou64(__cp) (*(unsigned long long *)(__cp)) +#define ctou48(__cp) ((*(unsigned long long *)(__cp)) & 0xffffffffffff) +#define ctou(__cp_t, __cp) (*(__cp_t *)(__cp)) + #ifndef min +#define min(x,y) (((x)<(y)) ? (x) : (y)) +#define max(x,y) (((x)>(y)) ? (x) : (y)) + #endif + #ifdef NDEBUG +#define AS(expr, fmt,args...) + #else +#include +#define AS(expr, fmt,args...) if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); } + #endif +#define die(fmt,args...) 
do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0) +#endif diff --git a/ext/OPT_PFD/main.cpp b/ext/OPT_PFD/main.cpp new file mode 100644 index 0000000..2c0ec06 --- /dev/null +++ b/ext/OPT_PFD/main.cpp @@ -0,0 +1,101 @@ +/* + * test for OPT-pfd + * + * Author: sding + * + * + */ + + + +#include +#include +#include + +#include "opt_p4.h" + +using namespace std; + +char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite"; // for reading list + +int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc) +{ + char fpath[128]; + sprintf(fpath,"%s/%s",PATH,term); + FILE *fdd = fopen(fpath,"r"); + if(fdd==NULL) return 0; + + int nread, npos; + + nread = fread(&npos, sizeof(unsigned), 1, fdd); + npos = 0; + + while (nread > 0) + { + nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd); + if (nread <= 0) break; + fread(&freq[npos], sizeof(unsigned), 1, fdd); + npos++; + } + fclose(fdd); + + int i; + + /* fill out the max values */ + for (i = 0; i < npos; i += BS) + maxc[(i/BS)] = doc_id[i+BS-1]; + + /* take the gap for doc_id */ + for (i = npos-1; i > 0; i--) + { + doc_id[i] -= doc_id[i-1]; + doc_id[i] --; + } + + for (i = 0; i < npos; i++) + freq[i]--; + return npos; +} + +int main() // just for testing +{ + int MAX_NDOC = 25205179; + unsigned int *docid = new unsigned int[MAX_NDOC]; + unsigned int *docid_check = new unsigned int[MAX_NDOC ]; + + unsigned int *fre = new unsigned int[MAX_NDOC]; + unsigned int *maxc = new unsigned int[MAX_NDOC/BS]; + unsigned int *aux = new unsigned int[MAX_NDOC]; + unsigned int * all_array = new unsigned int[2048]; // extra array for coding + + + int listSize = get_list("information", docid, fre, maxc); + cout<<"list size is "< size * 4) // int bytes + { + chunk_size = size *4; + b = l; + temp_en = ex_n; + } + } + + csize += chunk_size; + //printf("encode:%u\n", b); + p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n); 
+ offset += size; + } + + return csize; +} diff --git a/ext/OPT_PFD/pf.h b/ext/OPT_PFD/pf.h new file mode 100644 index 0000000..788f8cc --- /dev/null +++ b/ext/OPT_PFD/pf.h @@ -0,0 +1,158 @@ +#include "s16head.h" +#include "unpack.h" + + +#define BS 128 +#define FRAC 0.10 +#define S 16 +#define PCHUNK 128 + +void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w); + + +int detailed_p4_encode(unsigned int **w, unsigned int* p, int num , int *chunk_size, int * exception_n) +{ + int i, j, t, s; + + unsigned int b = cnum[num]; + int bb_e; + int bb_p; + int p_low; + unsigned int e_n = 0; + int max_p = 0; + int max_e = 0; + + unsigned int* out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2); + unsigned int* ex = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2); + unsigned int* po = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2); + + unsigned int* tp = NULL; + unsigned int *_pp, *_ww; + + if (b == 32) + { + (*w)[0] = ((b<<10)) + (0); + *w +=1; + for (i = 0; i < PCHUNK ; i++) (*w)[i] = p[i]; + *w += (PCHUNK); + (*chunk_size) = 1 + BS; + + free(out); + free(ex); + free(po); + return 0; + } + + for (i = 0; i < PCHUNK ; i++) + { + if ( p[i] >= (1<> b); + po[(e_n++)] = i; // + } + else + out[i] = p[i]; + } + + if (1) // force to pass every time + { + /*get the gap of position*/ + for(j = e_n-1;j>0;j--) + { + po[j] = po[j] - po[j-1] ; + po[j] --; + } + + s = ((b * PCHUNK)>>5); + tp = (*w); + (*w)[0] = ((num<<10))+e_n; // record b and number of exceptions into this value, in the other version we pick this value out and did not count it + (*w) += 1; + for (i = 0; i < s; i++) (*w)[i] = 0; + pack(out, b, PCHUNK , *w); + *w += s; + + unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ; + for(j=0;j>5; + s = 32 - b - (bp & 31); + if (s >= 0) + w[wp] |= (v[i]<>s); + w[wp+1] = (v[i]<<(32-s)); + } + } +} + +/*modified p4decode */ +unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w, unsigned int * all_array) +{ + + int i, s; + unsigned int 
x; + int flag = _w[0]; + (_w)++; + + unsigned int *_ww,*_pp; + unsigned int b = ((flag>>10) & 31); + unsigned int e_n = (flag & 1023) ; + + (unpack[b])(_p, _w); + + b = cnum[b]; + _w += ((b * BS)>>5); + unsigned int _k = 0; + unsigned int psum = 0; + if(e_n != 0 ) + { + for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);) + { + S16_DECODE(_ww, _pp); + } + + _w += (_ww - _w); + psum = all_array[0]; + + for(i=0;i>28; \ + switch(_k) \ + { \ + case 0: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 1: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ 
+ break; \ + case 2: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 3; _p++; \ + *_p = (*_w>>9) & 3; _p++; \ + *_p = (*_w>>11) & 3; _p++; \ + *_p = (*_w>>13) & 3; _p++; \ + *_p = (*_w>>15) & 3; _p++; \ + *_p = (*_w>>17) & 3; _p++; \ + *_p = (*_w>>19) & 3; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 3: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 4: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 5: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 7; _p++; \ + *_p = (*_w>>7) & 7; _p++; \ + *_p = (*_w>>10) & 7; _p++; \ + *_p = (*_w>>13) & 7; _p++; \ + *_p = (*_w>>16) & 7; _p++; \ + *_p = (*_w>>19) & 7; 
_p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 6: \ + *_p = (*_w) & 7; _p++; \ + *_p = (*_w>>3) & 15; _p++; \ + *_p = (*_w>>7) & 15; _p++; \ + *_p = (*_w>>11) & 15; _p++; \ + *_p = (*_w>>15) & 15; _p++; \ + *_p = (*_w>>19) & 7; _p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 7: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 15; _p++; \ + *_p = (*_w>>12) & 15; _p++; \ + *_p = (*_w>>16) & 15; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 8: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 31; _p++; \ + *_p = (*_w>>15) & 31; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 9: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 31; _p++; \ + *_p = (*_w>>13) & 31; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 10: \ + *_p = (*_w) & 63; _p++; \ + *_p = (*_w>>6) & 63; _p++; \ + *_p = (*_w>>12) & 63; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 11: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 63; _p++; \ + *_p = (*_w>>16) & 63; _p++; \ + *_p = (*_w>>22) & 63; _p++; \ + break; \ + case 12: \ + *_p = (*_w) & 127; _p++; \ + *_p = (*_w>>7) & 127; _p++; \ + *_p = (*_w>>14) & 127; _p++; \ + *_p = (*_w>>21) & 127; _p++; \ + break; \ + case 13: \ + *_p = (*_w) & 1023; _p++; \ + *_p = (*_w>>10) & 511; _p++; \ + *_p = (*_w>>19) & 511; _p++; \ + break; \ + case 14: \ + *_p = (*_w) & 16383; _p++; \ + *_p = (*_w>>14) & 16383; _p++; \ + break; \ + case 15: \ + *_p = (*_w) & ((1<<28)-1); _p++; \ + break; \ + }\ + _w++; \ +} + + + + + diff --git a/ext/OPT_PFD/unpack.h b/ext/OPT_PFD/unpack.h new file mode 100644 index 0000000..fa810e9 --- /dev/null +++ b/ext/OPT_PFD/unpack.h @@ -0,0 +1,773 @@ + 
+/*************************************************************/ +/* macros for fast unpacking of integers of fixed bit length */ +/*************************************************************/ + +#define BS 128 + +/* supported bit lengths */ +int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32}; + +void unpack0(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i++) p[i] = 0; +} + + +void unpack1(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 1) + { + p[0] = (w[0] >> 31); + p[1] = (w[0] >> 30) & 1; + p[2] = (w[0] >> 29) & 1; + p[3] = (w[0] >> 28) & 1; + p[4] = (w[0] >> 27) & 1; + p[5] = (w[0] >> 26) & 1; + p[6] = (w[0] >> 25) & 1; + p[7] = (w[0] >> 24) & 1; + p[8] = (w[0] >> 23) & 1; + p[9] = (w[0] >> 22) & 1; + p[10] = (w[0] >> 21) & 1; + p[11] = (w[0] >> 20) & 1; + p[12] = (w[0] >> 19) & 1; + p[13] = (w[0] >> 18) & 1; + p[14] = (w[0] >> 17) & 1; + p[15] = (w[0] >> 16) & 1; + p[16] = (w[0] >> 15) & 1; + p[17] = (w[0] >> 14) & 1; + p[18] = (w[0] >> 13) & 1; + p[19] = (w[0] >> 12) & 1; + p[20] = (w[0] >> 11) & 1; + p[21] = (w[0] >> 10) & 1; + p[22] = (w[0] >> 9) & 1; + p[23] = (w[0] >> 8) & 1; + p[24] = (w[0] >> 7) & 1; + p[25] = (w[0] >> 6) & 1; + p[26] = (w[0] >> 5) & 1; + p[27] = (w[0] >> 4) & 1; + p[28] = (w[0] >> 3) & 1; + p[29] = (w[0] >> 2) & 1; + p[30] = (w[0] >> 1) & 1; + p[31] = (w[0]) & 1; + } +} + + +void unpack2(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 2) + { + p[0] = (w[0] >> 30); + p[1] = (w[0] >> 28) & 3; + p[2] = (w[0] >> 26) & 3; + p[3] = (w[0] >> 24) & 3; + p[4] = (w[0] >> 22) & 3; + p[5] = (w[0] >> 20) & 3; + p[6] = (w[0] >> 18) & 3; + p[7] = (w[0] >> 16) & 3; + p[8] = (w[0] >> 14) & 3; + p[9] = (w[0] >> 12) & 3; + p[10] = (w[0] >> 10) & 3; + p[11] = (w[0] >> 8) & 3; + p[12] = (w[0] >> 6) & 3; + p[13] = (w[0] >> 4) & 3; + p[14] = (w[0] >> 2) & 3; + p[15] = (w[0]) & 3; + p[16] = (w[1] >> 30); + p[17] = (w[1] >> 28) & 3; + 
p[18] = (w[1] >> 26) & 3; + p[19] = (w[1] >> 24) & 3; + p[20] = (w[1] >> 22) & 3; + p[21] = (w[1] >> 20) & 3; + p[22] = (w[1] >> 18) & 3; + p[23] = (w[1] >> 16) & 3; + p[24] = (w[1] >> 14) & 3; + p[25] = (w[1] >> 12) & 3; + p[26] = (w[1] >> 10) & 3; + p[27] = (w[1] >> 8) & 3; + p[28] = (w[1] >> 6) & 3; + p[29] = (w[1] >> 4) & 3; + p[30] = (w[1] >> 2) & 3; + p[31] = (w[1]) & 3; + } +} + + +void unpack3(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 3) + { + p[0] = (w[0] >> 29); + p[1] = (w[0] >> 26) & 7; + p[2] = (w[0] >> 23) & 7; + p[3] = (w[0] >> 20) & 7; + p[4] = (w[0] >> 17) & 7; + p[5] = (w[0] >> 14) & 7; + p[6] = (w[0] >> 11) & 7; + p[7] = (w[0] >> 8) & 7; + p[8] = (w[0] >> 5) & 7; + p[9] = (w[0] >> 2) & 7; + p[10] = (w[0] << 1) & 7; + p[10] |= (w[1] >> 31); + p[11] = (w[1] >> 28) & 7; + p[12] = (w[1] >> 25) & 7; + p[13] = (w[1] >> 22) & 7; + p[14] = (w[1] >> 19) & 7; + p[15] = (w[1] >> 16) & 7; + p[16] = (w[1] >> 13) & 7; + p[17] = (w[1] >> 10) & 7; + p[18] = (w[1] >> 7) & 7; + p[19] = (w[1] >> 4) & 7; + p[20] = (w[1] >> 1) & 7; + p[21] = (w[1] << 2) & 7; + p[21] |= (w[2] >> 30); + p[22] = (w[2] >> 27) & 7; + p[23] = (w[2] >> 24) & 7; + p[24] = (w[2] >> 21) & 7; + p[25] = (w[2] >> 18) & 7; + p[26] = (w[2] >> 15) & 7; + p[27] = (w[2] >> 12) & 7; + p[28] = (w[2] >> 9) & 7; + p[29] = (w[2] >> 6) & 7; + p[30] = (w[2] >> 3) & 7; + p[31] = (w[2]) & 7; + } +} + + +void unpack4(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 4) + { + p[0] = (w[0] >> 28); + p[1] = (w[0] >> 24) & 15; + p[2] = (w[0] >> 20) & 15; + p[3] = (w[0] >> 16) & 15; + p[4] = (w[0] >> 12) & 15; + p[5] = (w[0] >> 8) & 15; + p[6] = (w[0] >> 4) & 15; + p[7] = (w[0]) & 15; + p[8] = (w[1] >> 28); + p[9] = (w[1] >> 24) & 15; + p[10] = (w[1] >> 20) & 15; + p[11] = (w[1] >> 16) & 15; + p[12] = (w[1] >> 12) & 15; + p[13] = (w[1] >> 8) & 15; + p[14] = (w[1] >> 4) & 15; + p[15] = (w[1]) & 15; + p[16] = (w[2] >> 28); + 
p[17] = (w[2] >> 24) & 15; + p[18] = (w[2] >> 20) & 15; + p[19] = (w[2] >> 16) & 15; + p[20] = (w[2] >> 12) & 15; + p[21] = (w[2] >> 8) & 15; + p[22] = (w[2] >> 4) & 15; + p[23] = (w[2]) & 15; + p[24] = (w[3] >> 28); + p[25] = (w[3] >> 24) & 15; + p[26] = (w[3] >> 20) & 15; + p[27] = (w[3] >> 16) & 15; + p[28] = (w[3] >> 12) & 15; + p[29] = (w[3] >> 8) & 15; + p[30] = (w[3] >> 4) & 15; + p[31] = (w[3]) & 15; + } +} + + +void unpack5(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 5) + { + p[0] = (w[0] >> 27); + p[1] = (w[0] >> 22) & 31; + p[2] = (w[0] >> 17) & 31; + p[3] = (w[0] >> 12) & 31; + p[4] = (w[0] >> 7) & 31; + p[5] = (w[0] >> 2) & 31; + p[6] = (w[0] << 3) & 31; + p[6] |= (w[1] >> 29); + p[7] = (w[1] >> 24) & 31; + p[8] = (w[1] >> 19) & 31; + p[9] = (w[1] >> 14) & 31; + p[10] = (w[1] >> 9) & 31; + p[11] = (w[1] >> 4) & 31; + p[12] = (w[1] << 1) & 31; + p[12] |= (w[2] >> 31); + p[13] = (w[2] >> 26) & 31; + p[14] = (w[2] >> 21) & 31; + p[15] = (w[2] >> 16) & 31; + p[16] = (w[2] >> 11) & 31; + p[17] = (w[2] >> 6) & 31; + p[18] = (w[2] >> 1) & 31; + p[19] = (w[2] << 4) & 31; + p[19] |= (w[3] >> 28); + p[20] = (w[3] >> 23) & 31; + p[21] = (w[3] >> 18) & 31; + p[22] = (w[3] >> 13) & 31; + p[23] = (w[3] >> 8) & 31; + p[24] = (w[3] >> 3) & 31; + p[25] = (w[3] << 2) & 31; + p[25] |= (w[4] >> 30); + p[26] = (w[4] >> 25) & 31; + p[27] = (w[4] >> 20) & 31; + p[28] = (w[4] >> 15) & 31; + p[29] = (w[4] >> 10) & 31; + p[30] = (w[4] >> 5) & 31; + p[31] = (w[4]) & 31; + } +} + + +void unpack6(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 6) + { + p[0] = (w[0] >> 26); + p[1] = (w[0] >> 20) & 63; + p[2] = (w[0] >> 14) & 63; + p[3] = (w[0] >> 8) & 63; + p[4] = (w[0] >> 2) & 63; + p[5] = (w[0] << 4) & 63; + p[5] |= (w[1] >> 28); + p[6] = (w[1] >> 22) & 63; + p[7] = (w[1] >> 16) & 63; + p[8] = (w[1] >> 10) & 63; + p[9] = (w[1] >> 4) & 63; + p[10] = (w[1] << 2) & 63; + p[10] |= (w[2] >> 
30); + p[11] = (w[2] >> 24) & 63; + p[12] = (w[2] >> 18) & 63; + p[13] = (w[2] >> 12) & 63; + p[14] = (w[2] >> 6) & 63; + p[15] = (w[2]) & 63; + p[16] = (w[3] >> 26); + p[17] = (w[3] >> 20) & 63; + p[18] = (w[3] >> 14) & 63; + p[19] = (w[3] >> 8) & 63; + p[20] = (w[3] >> 2) & 63; + p[21] = (w[3] << 4) & 63; + p[21] |= (w[4] >> 28); + p[22] = (w[4] >> 22) & 63; + p[23] = (w[4] >> 16) & 63; + p[24] = (w[4] >> 10) & 63; + p[25] = (w[4] >> 4) & 63; + p[26] = (w[4] << 2) & 63; + p[26] |= (w[5] >> 30); + p[27] = (w[5] >> 24) & 63; + p[28] = (w[5] >> 18) & 63; + p[29] = (w[5] >> 12) & 63; + p[30] = (w[5] >> 6) & 63; + p[31] = (w[5]) & 63; + } +} + + +void unpack7(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 7) + { + p[0] = (w[0] >> 25); + p[1] = (w[0] >> 18) & 127; + p[2] = (w[0] >> 11) & 127; + p[3] = (w[0] >> 4) & 127; + p[4] = (w[0] << 3) & 127; + p[4] |= (w[1] >> 29); + p[5] = (w[1] >> 22) & 127; + p[6] = (w[1] >> 15) & 127; + p[7] = (w[1] >> 8) & 127; + p[8] = (w[1] >> 1) & 127; + p[9] = (w[1] << 6) & 127; + p[9] |= (w[2] >> 26); + p[10] = (w[2] >> 19) & 127; + p[11] = (w[2] >> 12) & 127; + p[12] = (w[2] >> 5) & 127; + p[13] = (w[2] << 2) & 127; + p[13] |= (w[3] >> 30); + p[14] = (w[3] >> 23) & 127; + p[15] = (w[3] >> 16) & 127; + p[16] = (w[3] >> 9) & 127; + p[17] = (w[3] >> 2) & 127; + p[18] = (w[3] << 5) & 127; + p[18] |= (w[4] >> 27); + p[19] = (w[4] >> 20) & 127; + p[20] = (w[4] >> 13) & 127; + p[21] = (w[4] >> 6) & 127; + p[22] = (w[4] << 1) & 127; + p[22] |= (w[5] >> 31); + p[23] = (w[5] >> 24) & 127; + p[24] = (w[5] >> 17) & 127; + p[25] = (w[5] >> 10) & 127; + p[26] = (w[5] >> 3) & 127; + p[27] = (w[5] << 4) & 127; + p[27] |= (w[6] >> 28); + p[28] = (w[6] >> 21) & 127; + p[29] = (w[6] >> 14) & 127; + p[30] = (w[6] >> 7) & 127; + p[31] = (w[6]) & 127; + } +} + + +void unpack8(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 8) + { + p[0] = (w[0] >> 24); + p[1] = (w[0] 
>> 16) & 255; + p[2] = (w[0] >> 8) & 255; + p[3] = (w[0]) & 255; + p[4] = (w[1] >> 24); + p[5] = (w[1] >> 16) & 255; + p[6] = (w[1] >> 8) & 255; + p[7] = (w[1]) & 255; + p[8] = (w[2] >> 24); + p[9] = (w[2] >> 16) & 255; + p[10] = (w[2] >> 8) & 255; + p[11] = (w[2]) & 255; + p[12] = (w[3] >> 24); + p[13] = (w[3] >> 16) & 255; + p[14] = (w[3] >> 8) & 255; + p[15] = (w[3]) & 255; + p[16] = (w[4] >> 24); + p[17] = (w[4] >> 16) & 255; + p[18] = (w[4] >> 8) & 255; + p[19] = (w[4]) & 255; + p[20] = (w[5] >> 24); + p[21] = (w[5] >> 16) & 255; + p[22] = (w[5] >> 8) & 255; + p[23] = (w[5]) & 255; + p[24] = (w[6] >> 24); + p[25] = (w[6] >> 16) & 255; + p[26] = (w[6] >> 8) & 255; + p[27] = (w[6]) & 255; + p[28] = (w[7] >> 24); + p[29] = (w[7] >> 16) & 255; + p[30] = (w[7] >> 8) & 255; + p[31] = (w[7]) & 255; + } +} + + +void unpack9(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 9) + { + p[0] = (w[0] >> 23); + p[1] = (w[0] >> 14) & 511; + p[2] = (w[0] >> 5) & 511; + p[3] = (w[0] << 4) & 511; + p[3] |= (w[1] >> 28); + p[4] = (w[1] >> 19) & 511; + p[5] = (w[1] >> 10) & 511; + p[6] = (w[1] >> 1) & 511; + p[7] = (w[1] << 8) & 511; + p[7] |= (w[2] >> 24); + p[8] = (w[2] >> 15) & 511; + p[9] = (w[2] >> 6) & 511; + p[10] = (w[2] << 3) & 511; + p[10] |= (w[3] >> 29); + p[11] = (w[3] >> 20) & 511; + p[12] = (w[3] >> 11) & 511; + p[13] = (w[3] >> 2) & 511; + p[14] = (w[3] << 7) & 511; + p[14] |= (w[4] >> 25); + p[15] = (w[4] >> 16) & 511; + p[16] = (w[4] >> 7) & 511; + p[17] = (w[4] << 2) & 511; + p[17] |= (w[5] >> 30); + p[18] = (w[5] >> 21) & 511; + p[19] = (w[5] >> 12) & 511; + p[20] = (w[5] >> 3) & 511; + p[21] = (w[5] << 6) & 511; + p[21] |= (w[6] >> 26); + p[22] = (w[6] >> 17) & 511; + p[23] = (w[6] >> 8) & 511; + p[24] = (w[6] << 1) & 511; + p[24] |= (w[7] >> 31); + p[25] = (w[7] >> 22) & 511; + p[26] = (w[7] >> 13) & 511; + p[27] = (w[7] >> 4) & 511; + p[28] = (w[7] << 5) & 511; + p[28] |= (w[8] >> 27); + p[29] = (w[8] >> 18) & 511; 
+ p[30] = (w[8] >> 9) & 511; + p[31] = (w[8]) & 511; + } +} + + +void unpack10(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 10) + { + p[0] = (w[0] >> 22); + p[1] = (w[0] >> 12) & 1023; + p[2] = (w[0] >> 2) & 1023; + p[3] = (w[0] << 8) & 1023; + p[3] |= (w[1] >> 24); + p[4] = (w[1] >> 14) & 1023; + p[5] = (w[1] >> 4) & 1023; + p[6] = (w[1] << 6) & 1023; + p[6] |= (w[2] >> 26); + p[7] = (w[2] >> 16) & 1023; + p[8] = (w[2] >> 6) & 1023; + p[9] = (w[2] << 4) & 1023; + p[9] |= (w[3] >> 28); + p[10] = (w[3] >> 18) & 1023; + p[11] = (w[3] >> 8) & 1023; + p[12] = (w[3] << 2) & 1023; + p[12] |= (w[4] >> 30); + p[13] = (w[4] >> 20) & 1023; + p[14] = (w[4] >> 10) & 1023; + p[15] = (w[4]) & 1023; + p[16] = (w[5] >> 22); + p[17] = (w[5] >> 12) & 1023; + p[18] = (w[5] >> 2) & 1023; + p[19] = (w[5] << 8) & 1023; + p[19] |= (w[6] >> 24); + p[20] = (w[6] >> 14) & 1023; + p[21] = (w[6] >> 4) & 1023; + p[22] = (w[6] << 6) & 1023; + p[22] |= (w[7] >> 26); + p[23] = (w[7] >> 16) & 1023; + p[24] = (w[7] >> 6) & 1023; + p[25] = (w[7] << 4) & 1023; + p[25] |= (w[8] >> 28); + p[26] = (w[8] >> 18) & 1023; + p[27] = (w[8] >> 8) & 1023; + p[28] = (w[8] << 2) & 1023; + p[28] |= (w[9] >> 30); + p[29] = (w[9] >> 20) & 1023; + p[30] = (w[9] >> 10) & 1023; + p[31] = (w[9]) & 1023; + } +} + + +void unpack11(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 11) + { + p[0] = (w[0] >> 21); + p[1] = (w[0] >> 10) & 2047; + p[2] = (w[0] << 1) & 2047; + p[2] |= (w[1] >> 31); + p[3] = (w[1] >> 20) & 2047; + p[4] = (w[1] >> 9) & 2047; + p[5] = (w[1] << 2) & 2047; + p[5] |= (w[2] >> 30); + p[6] = (w[2] >> 19) & 2047; + p[7] = (w[2] >> 8) & 2047; + p[8] = (w[2] << 3) & 2047; + p[8] |= (w[3] >> 29); + p[9] = (w[3] >> 18) & 2047; + p[10] = (w[3] >> 7) & 2047; + p[11] = (w[3] << 4) & 2047; + p[11] |= (w[4] >> 28); + p[12] = (w[4] >> 17) & 2047; + p[13] = (w[4] >> 6) & 2047; + p[14] = (w[4] << 5) & 2047; + p[14] |= (w[5] >> 
27); + p[15] = (w[5] >> 16) & 2047; + p[16] = (w[5] >> 5) & 2047; + p[17] = (w[5] << 6) & 2047; + p[17] |= (w[6] >> 26); + p[18] = (w[6] >> 15) & 2047; + p[19] = (w[6] >> 4) & 2047; + p[20] = (w[6] << 7) & 2047; + p[20] |= (w[7] >> 25); + p[21] = (w[7] >> 14) & 2047; + p[22] = (w[7] >> 3) & 2047; + p[23] = (w[7] << 8) & 2047; + p[23] |= (w[8] >> 24); + p[24] = (w[8] >> 13) & 2047; + p[25] = (w[8] >> 2) & 2047; + p[26] = (w[8] << 9) & 2047; + p[26] |= (w[9] >> 23); + p[27] = (w[9] >> 12) & 2047; + p[28] = (w[9] >> 1) & 2047; + p[29] = (w[9] << 10) & 2047; + p[29] |= (w[10] >> 22); + p[30] = (w[10] >> 11) & 2047; + p[31] = (w[10]) & 2047; + } +} + + +void unpack12(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 12) + { + p[0] = (w[0] >> 20); + p[1] = (w[0] >> 8) & 4095; + p[2] = (w[0] << 4) & 4095; + p[2] |= (w[1] >> 28); + p[3] = (w[1] >> 16) & 4095; + p[4] = (w[1] >> 4) & 4095; + p[5] = (w[1] << 8) & 4095; + p[5] |= (w[2] >> 24); + p[6] = (w[2] >> 12) & 4095; + p[7] = (w[2]) & 4095; + p[8] = (w[3] >> 20); + p[9] = (w[3] >> 8) & 4095; + p[10] = (w[3] << 4) & 4095; + p[10] |= (w[4] >> 28); + p[11] = (w[4] >> 16) & 4095; + p[12] = (w[4] >> 4) & 4095; + p[13] = (w[4] << 8) & 4095; + p[13] |= (w[5] >> 24); + p[14] = (w[5] >> 12) & 4095; + p[15] = (w[5]) & 4095; + p[16] = (w[6] >> 20); + p[17] = (w[6] >> 8) & 4095; + p[18] = (w[6] << 4) & 4095; + p[18] |= (w[7] >> 28); + p[19] = (w[7] >> 16) & 4095; + p[20] = (w[7] >> 4) & 4095; + p[21] = (w[7] << 8) & 4095; + p[21] |= (w[8] >> 24); + p[22] = (w[8] >> 12) & 4095; + p[23] = (w[8]) & 4095; + p[24] = (w[9] >> 20); + p[25] = (w[9] >> 8) & 4095; + p[26] = (w[9] << 4) & 4095; + p[26] |= (w[10] >> 28); + p[27] = (w[10] >> 16) & 4095; + p[28] = (w[10] >> 4) & 4095; + p[29] = (w[10] << 8) & 4095; + p[29] |= (w[11] >> 24); + p[30] = (w[11] >> 12) & 4095; + p[31] = (w[11]) & 4095; + } +} + + +void unpack13(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p 
+= 32, w += 13) + { + p[0] = (w[0] >> 19); + p[1] = (w[0] >> 6) & 8191; + p[2] = (w[0] << 7) & 8191; + p[2] |= (w[1] >> 25); + p[3] = (w[1] >> 12) & 8191; + p[4] = (w[1] << 1) & 8191; + p[4] |= (w[2] >> 31); + p[5] = (w[2] >> 18) & 8191; + p[6] = (w[2] >> 5) & 8191; + p[7] = (w[2] << 8) & 8191; + p[7] |= (w[3] >> 24); + p[8] = (w[3] >> 11) & 8191; + p[9] = (w[3] << 2) & 8191; + p[9] |= (w[4] >> 30); + p[10] = (w[4] >> 17) & 8191; + p[11] = (w[4] >> 4) & 8191; + p[12] = (w[4] << 9) & 8191; + p[12] |= (w[5] >> 23); + p[13] = (w[5] >> 10) & 8191; + p[14] = (w[5] << 3) & 8191; + p[14] |= (w[6] >> 29); + p[15] = (w[6] >> 16) & 8191; + p[16] = (w[6] >> 3) & 8191; + p[17] = (w[6] << 10) & 8191; + p[17] |= (w[7] >> 22); + p[18] = (w[7] >> 9) & 8191; + p[19] = (w[7] << 4) & 8191; + p[19] |= (w[8] >> 28); + p[20] = (w[8] >> 15) & 8191; + p[21] = (w[8] >> 2) & 8191; + p[22] = (w[8] << 11) & 8191; + p[22] |= (w[9] >> 21); + p[23] = (w[9] >> 8) & 8191; + p[24] = (w[9] << 5) & 8191; + p[24] |= (w[10] >> 27); + p[25] = (w[10] >> 14) & 8191; + p[26] = (w[10] >> 1) & 8191; + p[27] = (w[10] << 12) & 8191; + p[27] |= (w[11] >> 20); + p[28] = (w[11] >> 7) & 8191; + p[29] = (w[11] << 6) & 8191; + p[29] |= (w[12] >> 26); + p[30] = (w[12] >> 13) & 8191; + p[31] = (w[12]) & 8191; + } +} + + +void unpack16(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 16) + { + p[0] = (w[0] >> 16); + p[1] = (w[0]) & 65535; + p[2] = (w[1] >> 16); + p[3] = (w[1]) & 65535; + p[4] = (w[2] >> 16); + p[5] = (w[2]) & 65535; + p[6] = (w[3] >> 16); + p[7] = (w[3]) & 65535; + p[8] = (w[4] >> 16); + p[9] = (w[4]) & 65535; + p[10] = (w[5] >> 16); + p[11] = (w[5]) & 65535; + p[12] = (w[6] >> 16); + p[13] = (w[6]) & 65535; + p[14] = (w[7] >> 16); + p[15] = (w[7]) & 65535; + p[16] = (w[8] >> 16); + p[17] = (w[8]) & 65535; + p[18] = (w[9] >> 16); + p[19] = (w[9]) & 65535; + p[20] = (w[10] >> 16); + p[21] = (w[10]) & 65535; + p[22] = (w[11] >> 16); + p[23] = (w[11]) & 65535; + 
p[24] = (w[12] >> 16); + p[25] = (w[12]) & 65535; + p[26] = (w[13] >> 16); + p[27] = (w[13]) & 65535; + p[28] = (w[14] >> 16); + p[29] = (w[14]) & 65535; + p[30] = (w[15] >> 16); + p[31] = (w[15]) & 65535; + } +} + + +void unpack20(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 20) + { + p[0] = (w[0] >> 12); + p[1] = (w[0] << 8) & ((1<<20)-1); + p[1] |= (w[1] >> 24); + p[2] = (w[1] >> 4) & ((1<<20)-1); + p[3] = (w[1] << 16) & ((1<<20)-1); + p[3] |= (w[2] >> 16); + p[4] = (w[2] << 4) & ((1<<20)-1); + p[4] |= (w[3] >> 28); + p[5] = (w[3] >> 8) & ((1<<20)-1); + p[6] = (w[3] << 12) & ((1<<20)-1); + p[6] |= (w[4] >> 20); + p[7] = (w[4]) & ((1<<20)-1); + p[8] = (w[5] >> 12); + p[9] = (w[5] << 8) & ((1<<20)-1); + p[9] |= (w[6] >> 24); + p[10] = (w[6] >> 4) & ((1<<20)-1); + p[11] = (w[6] << 16) & ((1<<20)-1); + p[11] |= (w[7] >> 16); + p[12] = (w[7] << 4) & ((1<<20)-1); + p[12] |= (w[8] >> 28); + p[13] = (w[8] >> 8) & ((1<<20)-1); + p[14] = (w[8] << 12) & ((1<<20)-1); + p[14] |= (w[9] >> 20); + p[15] = (w[9]) & ((1<<20)-1); + p[16] = (w[10] >> 12); + p[17] = (w[10] << 8) & ((1<<20)-1); + p[17] |= (w[11] >> 24); + p[18] = (w[11] >> 4) & ((1<<20)-1); + p[19] = (w[11] << 16) & ((1<<20)-1); + p[19] |= (w[12] >> 16); + p[20] = (w[12] << 4) & ((1<<20)-1); + p[20] |= (w[13] >> 28); + p[21] = (w[13] >> 8) & ((1<<20)-1); + p[22] = (w[13] << 12) & ((1<<20)-1); + p[22] |= (w[14] >> 20); + p[23] = (w[14]) & ((1<<20)-1); + p[24] = (w[15] >> 12); + p[25] = (w[15] << 8) & ((1<<20)-1); + p[25] |= (w[16] >> 24); + p[26] = (w[16] >> 4) & ((1<<20)-1); + p[27] = (w[16] << 16) & ((1<<20)-1); + p[27] |= (w[17] >> 16); + p[28] = (w[17] << 4) & ((1<<20)-1); + p[28] |= (w[18] >> 28); + p[29] = (w[18] >> 8) & ((1<<20)-1); + p[30] = (w[18] << 12) & ((1<<20)-1); + p[30] |= (w[19] >> 20); + p[31] = (w[19]) & ((1<<20)-1); + } +} + + +void unpack32(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 32) + { + p[0] = 
w[0]; + p[1] = w[1]; + p[2] = w[2]; + p[3] = w[3]; + p[4] = w[4]; + p[5] = w[5]; + p[6] = w[6]; + p[7] = w[7]; + p[8] = w[8]; + p[9] = w[9]; + p[10] = w[10]; + p[11] = w[11]; + p[12] = w[12]; + p[13] = w[13]; + p[14] = w[14]; + p[15] = w[15]; + p[16] = w[16]; + p[17] = w[17]; + p[18] = w[18]; + p[19] = w[19]; + p[20] = w[20]; + p[21] = w[21]; + p[22] = w[22]; + p[23] = w[23]; + p[24] = w[24]; + p[25] = w[25]; + p[26] = w[26]; + p[27] = w[27]; + p[28] = w[28]; + p[29] = w[29]; + p[30] = w[30]; + p[31] = w[31]; + } +} + + +typedef void (*pf)(unsigned int *p, unsigned int *w); +pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5, + unpack6, unpack7, unpack8, unpack9, unpack10, unpack11, + unpack12, unpack13, unpack16, unpack20, unpack32}; + diff --git a/ext/simdcomp/bitpacka.c b/ext/simdcomp/bitpacka.c new file mode 100644 index 0000000..974237a --- /dev/null +++ b/ext/simdcomp/bitpacka.c @@ -0,0 +1,17773 @@ +#include "bitpacka.h" +#define INLINE inline +uint32_t * nullpacker(const uint32_t * __restrict in, uint32_t * __restrict out) { + return out; +} + + const uint32_t * nullunpacker8(const uint32_t * __restrict in, uint32_t * __restrict out) { + memset(out,0,8 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 1 ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 3 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 5 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 7 ; + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 8 ; + *out |= ( (*in++) ) << 10 ; + *out |= ( (*in++) ) << 12 ; + *out |= ( (*in++) ) << 14 ; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_8(const uint32_t * __restrict 
in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out 
|= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * 
__fastpackwithoutmask12_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = 
( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( 
(*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 
26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * 
__fastpackwithoutmask26_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + 
+ uint32_t * __fastpackwithoutmask29_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + 
*out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + +#if 0 +#define OUTI(__x) *out++ +#define OUT(__x) *out +#define OUI out++ +#else +#define OUTI(__x) out[__x] +#define OUT(__x) out[__x] +#define OUI +#endif +const INLINE uint32_t * __fastunpack1_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) & 1; + OUTI( 1) = ( (*in) >> 1 ) & 1; + OUTI( 2) = ( (*in) >> 2 ) & 1; + OUTI( 3) = ( (*in) >> 3 ) & 1; + OUTI( 4) = ( (*in) >> 4 ) & 1; + OUTI( 5) = ( (*in) >> 5 ) & 1; + OUTI( 6) = ( (*in) >> 6 ) & 1; + OUTI( 7) = ( (*in) >> 7 ) & 1; + return in + 1; +} + +const INLINE uint32_t * __fastunpack2_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + OUTI( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + OUTI( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + OUTI( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + OUTI( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + OUTI( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + OUTI( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack3_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + OUTI( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + OUTI( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + OUTI( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + OUTI( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + OUTI( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + OUTI( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + OUTI( 7) = ( (*in) >> 21 ) 
% (1U << 3 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack4_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + OUTI( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + OUTI( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + OUTI( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + OUTI( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + OUTI( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + OUTI( 7) = ( (*in++) >> 28 ) ; + return in; +} + +const uint32_t * __fastunpack5_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + OUTI( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + OUTI( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + OUTI( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + OUTI( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + OUTI( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + OUT( 6) = ( (*in++) >> 30 ) ; + OUT( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + OUI; + OUTI( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack6_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + OUTI( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + OUTI( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + OUTI( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + OUTI( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + OUT( 5) = ( (*in++) >> 30 ) ; + OUT( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + OUI; + OUTI( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + OUTI( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack7_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + OUTI( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + OUTI( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + OUTI( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + OUT( 4) = ( (*in++) >> 28 ) ; + OUT( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + OUI; + OUTI( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + OUTI( 6 ) = ( (*in) >> 10 ) % (1U << 7 ) ; + OUTI( 7 ) = ( (*in) >> 17 ) % (1U 
<< 7 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack8_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 3) = ( (*in++) >> 24 ) ; + OUTI( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 7) = ( (*in++) >> 24 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack9_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + OUTI( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + OUTI( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + OUT( 3) = ( (*in++) >> 27 ) ; + OUT( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + OUI; + OUTI( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + OUTI( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + OUTI( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + OUT( 7) = ( (*in++) >> 31 ) ; + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack10_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + OUTI( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + OUTI( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + OUT( 3) = ( (*in++) >> 30 ) ; + OUT( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + OUI; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + OUTI( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + OUT( 6) = ( (*in++) >> 28 ) ; + OUT( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + OUI; + OUTI( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack11_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ((*in) >> 0 ) % (1U << 11 ) ; + OUTI( 1) = ((*in) >> 11 ) % (1U << 11 ) ; + OUT( 2) = ((*in++) >> 22 ) ; + OUT( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + OUI; + OUTI( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + OUTI( 4) = ((*in) >> 12 ) % (1U << 11 ) ; + OUT( 5) = (*in++) >> 23; + OUT( 5) |= 
((*in) % (1U<< 2 ))<<( 11 - 2 ); + OUI; + OUTI( 6) = ((*in) >> 2 ) % (1U << 11 ) ; + OUTI( 7) = ((*in) >> 13 ) % (1U << 11 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack12_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + OUTI( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + OUT( 2) = ( (*in++) >> 24 ) ; + OUT( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + OUI; + OUTI( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + OUT( 5) = ( (*in++) >> 28 ) ; + OUT( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + OUI; + OUTI( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; + OUTI( 7) = ( (*in++) >> 20 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack13_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + OUTI( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + OUT( 2) = ( (*in++) >> 26 ) ; + OUT( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + OUI; + OUTI( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + OUT( 4) = ( (*in++) >> 20 ) ; + OUT( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + OUI; + OUTI( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + OUTI( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + OUT( 7) = ( (*in++) >> 27 ); + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack14_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + *out++ = ( (*in) >> 0 ) % (1U << 14 ) ; + *out++ = ( (*in) >> 14 ) % (1U << 14 ) ; + *out = ( (*in++) >> 28 ) ; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out++ = ( (*in) >> 10 ) % (1U << 14 ) ; + *out = ( (*in++) >> 24 ) ; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out++ = ( (*in) >> 6 ) % (1U << 14 ) ; + *out = ( (*in++) >> 20 ) ; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out++ = ( (*in) >> 2 ) % (1U << 14 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack15_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) 
>> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack16_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack17_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack18_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + 
out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack19_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack20_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack21_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + 
out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack22_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack23_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_8(const uint32_t * __restrict in, 
uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( 
(*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 
))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + + return in + 
1; + } + + + + +const uint32_t * __fastunpack32_8(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker8(in,out); + + case 1: + return __fastunpack1_8(in,out); + + case 2: + return __fastunpack2_8(in,out); + + case 3: + return __fastunpack3_8(in,out); + + case 4: + return __fastunpack4_8(in,out); + + case 5: + return __fastunpack5_8(in,out); + + case 6: + return __fastunpack6_8(in,out); + + case 7: + return __fastunpack7_8(in,out); + + case 8: + return __fastunpack8_8(in,out); + + case 9: + return __fastunpack9_8(in,out); + + case 10: + return __fastunpack10_8(in,out); + + case 11: + return __fastunpack11_8(in,out); + + case 12: + return __fastunpack12_8(in,out); + + case 13: + return __fastunpack13_8(in,out); + + case 14: + return __fastunpack14_8(in,out); + + case 15: + return __fastunpack15_8(in,out); + + case 16: + return __fastunpack16_8(in,out); + + case 17: + return __fastunpack17_8(in,out); + + case 18: + return __fastunpack18_8(in,out); + + case 19: + return __fastunpack19_8(in,out); + + case 20: + return __fastunpack20_8(in,out); + + case 21: + return __fastunpack21_8(in,out); + + case 22: + return __fastunpack22_8(in,out); + + case 23: + return __fastunpack23_8(in,out); + + case 24: + return __fastunpack24_8(in,out); + + case 25: + return __fastunpack25_8(in,out); + + case 26: + return __fastunpack26_8(in,out); + + case 27: + return __fastunpack27_8(in,out); + + case 28: + return __fastunpack28_8(in,out); + + case 29: + return 
__fastunpack29_8(in,out); + + case 30: + return __fastunpack30_8(in,out); + + case 31: + return __fastunpack31_8(in,out); + + case 32: + return __fastunpack32_8(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_8(in,out); + + case 2: + return __fastpackwithoutmask2_8(in,out); + + case 3: + return __fastpackwithoutmask3_8(in,out); + + case 4: + return __fastpackwithoutmask4_8(in,out); + + case 5: + return __fastpackwithoutmask5_8(in,out); + + case 6: + return __fastpackwithoutmask6_8(in,out); + + case 7: + return __fastpackwithoutmask7_8(in,out); + + case 8: + return __fastpackwithoutmask8_8(in,out); + + case 9: + return __fastpackwithoutmask9_8(in,out); + + case 10: + return __fastpackwithoutmask10_8(in,out); + + case 11: + return __fastpackwithoutmask11_8(in,out); + + case 12: + return __fastpackwithoutmask12_8(in,out); + + case 13: + return __fastpackwithoutmask13_8(in,out); + + case 14: + return __fastpackwithoutmask14_8(in,out); + + case 15: + return __fastpackwithoutmask15_8(in,out); + + case 16: + return __fastpackwithoutmask16_8(in,out); + + case 17: + return __fastpackwithoutmask17_8(in,out); + + case 18: + return __fastpackwithoutmask18_8(in,out); + + case 19: + return __fastpackwithoutmask19_8(in,out); + + case 20: + return __fastpackwithoutmask20_8(in,out); + + case 21: + return __fastpackwithoutmask21_8(in,out); + + case 22: + return __fastpackwithoutmask22_8(in,out); + + case 23: + return __fastpackwithoutmask23_8(in,out); + + case 24: + return __fastpackwithoutmask24_8(in,out); + + case 25: + return __fastpackwithoutmask25_8(in,out); + + case 26: + return __fastpackwithoutmask26_8(in,out); + + case 27: + return 
__fastpackwithoutmask27_8(in,out); + + case 28: + return __fastpackwithoutmask28_8(in,out); + + case 29: + return __fastpackwithoutmask29_8(in,out); + + case 30: + return __fastpackwithoutmask30_8(in,out); + + case 31: + return __fastpackwithoutmask31_8(in,out); + + case 32: + return __fastpackwithoutmask32_8(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker16(const uint32_t * __restrict in, uint32_t * __restrict out) { + memset(out,0,16 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_16(const uint32_t * 
__restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 
1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out 
= (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( 
(*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_16(const 
uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out 
|= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( 
(*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; 
+ *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( 
(*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 
; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + 
++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + 
*out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * 
__fastpackwithoutmask28_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; 
/* ------------------------------------------------------------------ *
 * Generic 16-element bit packing / unpacking.
 *
 * The original file spells each bit width out as fully unrolled,
 * machine-generated code.  The wrappers below preserve the exact same
 * external contracts (names, signatures, stored words, return values)
 * while routing every width through two small generic kernels.
 * ------------------------------------------------------------------ */

/* Pack 16 values of 'bits' bits each from in[0..15] into out.
 * No masking is applied: callers guarantee each value fits in 'bits'
 * bits (the "withoutmask" contract of the unrolled originals).
 * Returns one past the last 32-bit word written, i.e.
 * out + ceil(16*bits/32). */
static uint32_t *pack16_nomask_g(const uint32_t *__restrict in,
                                 uint32_t *__restrict out, unsigned bits) {
    uint32_t word = 0;   /* output word currently being assembled */
    unsigned filled = 0; /* bits already placed in 'word' (always < 32 here) */
    unsigned k;
    for (k = 0; k < 16; k++) {
        uint32_t v = in[k];
        word |= v << filled;
        filled += bits;
        if (filled >= 32) {
            *out++ = word;
            filled -= 32;
            /* carry over the bits of v that did not fit; when filled==0
             * (exact boundary, incl. bits==32) there is nothing to carry
             * and the guard also avoids an undefined shift by 32 */
            word = filled ? v >> (bits - filled) : 0;
        }
    }
    if (filled)
        *out++ = word; /* flush a final, partially filled word */
    return out;
}

/* Unpack 16 values of 'bits' bits each from in into out[0..15].
 * Values straddling a word boundary take their high part from the low
 * bits of the next input word, exactly as the unrolled originals do.
 * Returns one past the last input word holding payload bits, i.e.
 * in + ceil(16*bits/32). */
static const uint32_t *unpack16_g(const uint32_t *__restrict in,
                                  uint32_t *__restrict out, unsigned bits) {
    const uint32_t mask = (bits < 32) ? ((1U << bits) - 1) : 0xFFFFFFFFu;
    unsigned used = 0; /* bits of *in already consumed (always < 32 here) */
    unsigned k;
    for (k = 0; k < 16; k++) {
        uint32_t v = *in >> used;
        used += bits;
        if (used >= 32) {
            used -= 32;
            ++in;
            if (used)
                v |= *in << (bits - used); /* value continues in next word */
        }
        out[k] = v & mask; /* masking after the merge == per-term % (1U<<b) */
    }
    return used ? in + 1 : in;
}

uint32_t *__fastpackwithoutmask30_16(const uint32_t *__restrict in,
                                     uint32_t *__restrict out) {
    return pack16_nomask_g(in, out, 30);
}

uint32_t *__fastpackwithoutmask31_16(const uint32_t *__restrict in,
                                     uint32_t *__restrict out) {
    return pack16_nomask_g(in, out, 31);
}

uint32_t *__fastpackwithoutmask32_16(const uint32_t *__restrict in,
                                     uint32_t *__restrict out) {
    return pack16_nomask_g(in, out, 32);
}

const uint32_t *__fastunpack1_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 1);
}

const uint32_t *__fastunpack2_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 2);
}

const uint32_t *__fastunpack3_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 3);
}

const uint32_t *__fastunpack4_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 4);
}

const uint32_t *__fastunpack5_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 5);
}

const uint32_t *__fastunpack6_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 6);
}

const uint32_t *__fastunpack7_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 7);
}

const uint32_t *__fastunpack8_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 8);
}

const uint32_t *__fastunpack9_16(const uint32_t *__restrict in,
                                 uint32_t *__restrict out) {
    return unpack16_g(in, out, 9);
}

const uint32_t *__fastunpack10_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 10);
}

const uint32_t *__fastunpack11_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 11);
}

const uint32_t *__fastunpack12_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 12);
}

const uint32_t *__fastunpack13_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 13);
}

const uint32_t *__fastunpack14_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 14);
}

const uint32_t *__fastunpack15_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 15);
}

const uint32_t *__fastunpack16_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 16);
}

const uint32_t *__fastunpack17_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 17);
}

const uint32_t *__fastunpack18_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 18);
}

const uint32_t *__fastunpack19_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 19);
}

const uint32_t *__fastunpack20_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 20);
}

const uint32_t *__fastunpack21_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 21);
}

const uint32_t *__fastunpack22_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 22);
}

const uint32_t *__fastunpack23_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 23);
}

const uint32_t *__fastunpack24_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 24);
}

const uint32_t *__fastunpack25_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 25);
}

const uint32_t *__fastunpack26_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 26);
}

const uint32_t *__fastunpack27_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 27);
}

const uint32_t *__fastunpack28_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 28);
}

const uint32_t *__fastunpack29_16(const uint32_t *__restrict in,
                                  uint32_t *__restrict out) {
    return unpack16_g(in, out, 29);
}
( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_16(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_16(const uint32_t * __restrict in, uint32_t * 
__restrict out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker16(in,out); + + case 1: + return __fastunpack1_16(in,out); + + case 2: + return __fastunpack2_16(in,out); + + case 3: + return __fastunpack3_16(in,out); + + case 4: + return __fastunpack4_16(in,out); + + case 5: + return __fastunpack5_16(in,out); + + case 6: + return __fastunpack6_16(in,out); + + case 7: + return __fastunpack7_16(in,out); + + case 8: + return __fastunpack8_16(in,out); + + case 9: + return __fastunpack9_16(in,out); + + case 10: + return __fastunpack10_16(in,out); + + case 11: + return __fastunpack11_16(in,out); + + case 12: + return __fastunpack12_16(in,out); + + case 13: + return __fastunpack13_16(in,out); + + case 14: + return __fastunpack14_16(in,out); + + case 15: + return __fastunpack15_16(in,out); + + case 16: + return __fastunpack16_16(in,out); + + case 17: + return __fastunpack17_16(in,out); + + case 18: + return __fastunpack18_16(in,out); + + case 19: + return __fastunpack19_16(in,out); + + case 20: + return __fastunpack20_16(in,out); + + case 21: + return __fastunpack21_16(in,out); + + case 22: + return __fastunpack22_16(in,out); + + case 23: + return 
__fastunpack23_16(in,out); + + case 24: + return __fastunpack24_16(in,out); + + case 25: + return __fastunpack25_16(in,out); + + case 26: + return __fastunpack26_16(in,out); + + case 27: + return __fastunpack27_16(in,out); + + case 28: + return __fastunpack28_16(in,out); + + case 29: + return __fastunpack29_16(in,out); + + case 30: + return __fastunpack30_16(in,out); + + case 31: + return __fastunpack31_16(in,out); + + case 32: + return __fastunpack32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_16(in,out); + + case 2: + return __fastpackwithoutmask2_16(in,out); + + case 3: + return __fastpackwithoutmask3_16(in,out); + + case 4: + return __fastpackwithoutmask4_16(in,out); + + case 5: + return __fastpackwithoutmask5_16(in,out); + + case 6: + return __fastpackwithoutmask6_16(in,out); + + case 7: + return __fastpackwithoutmask7_16(in,out); + + case 8: + return __fastpackwithoutmask8_16(in,out); + + case 9: + return __fastpackwithoutmask9_16(in,out); + + case 10: + return __fastpackwithoutmask10_16(in,out); + + case 11: + return __fastpackwithoutmask11_16(in,out); + + case 12: + return __fastpackwithoutmask12_16(in,out); + + case 13: + return __fastpackwithoutmask13_16(in,out); + + case 14: + return __fastpackwithoutmask14_16(in,out); + + case 15: + return __fastpackwithoutmask15_16(in,out); + + case 16: + return __fastpackwithoutmask16_16(in,out); + + case 17: + return __fastpackwithoutmask17_16(in,out); + + case 18: + return __fastpackwithoutmask18_16(in,out); + + case 19: + return __fastpackwithoutmask19_16(in,out); + + case 20: + return __fastpackwithoutmask20_16(in,out); + + case 21: + return __fastpackwithoutmask21_16(in,out); + + 
case 22: + return __fastpackwithoutmask22_16(in,out); + + case 23: + return __fastpackwithoutmask23_16(in,out); + + case 24: + return __fastpackwithoutmask24_16(in,out); + + case 25: + return __fastpackwithoutmask25_16(in,out); + + case 26: + return __fastpackwithoutmask26_16(in,out); + + case 27: + return __fastpackwithoutmask27_16(in,out); + + case 28: + return __fastpackwithoutmask28_16(in,out); + + case 29: + return __fastpackwithoutmask29_16(in,out); + + case 30: + return __fastpackwithoutmask30_16(in,out); + + case 31: + return __fastpackwithoutmask31_16(in,out); + + case 32: + return __fastpackwithoutmask32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker24(const uint32_t * __restrict in, uint32_t * __restrict out) { + memset(out,0,24 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + 
++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; 
+ ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + + return out + 1; + } + + + + uint32_t * 
__fastpackwithoutmask6_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( 
(*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; 
+ ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; 
+ *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( 
(*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 
- 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= 
( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + 
++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + 
++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> 
( 19 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out 
|= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 
18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; 
+ *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + 
++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 
20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 
27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( 
(*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + 
*out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) 
>> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + 
++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + + + + +const uint32_t * __fastunpack1_24(const uint32_t * 
__restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + *out = ( (*in) >> 16 ) & 1 ; + out++; + *out = ( (*in) >> 17 ) & 1 ; + out++; + *out = ( (*in) >> 18 ) & 1 ; + out++; + *out = ( (*in) >> 19 ) & 1 ; + out++; + *out = ( (*in) >> 20 ) & 1 ; + out++; + *out = ( (*in) >> 21 ) & 1 ; + out++; + *out = ( (*in) >> 22 ) & 1 ; + out++; + *out = ( (*in) >> 23 ) & 1 ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack2_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + 
*out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack3_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 3 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack4_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) 
>> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack5_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U 
<< 5 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 5 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack6_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack7_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 14 ) 
% (1U << 7 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 7 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack8_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) 
% (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack9_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 9 ) 
; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack10_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack11_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; 
+ *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack12_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + 
*out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack13_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + out++; + *out = ( (*in) >> 10 ) % 
(1U << 13 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 13 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack14_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + 
out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack15_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack16_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % 
(1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack17_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 17 ) ; + out++; + 
*out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 17 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack18_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out 
= ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack19_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= 
((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack20_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 
4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + 
out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack22_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * 
__fastunpack23_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_24(const 
uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 
25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % 
(1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= 
((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= 
((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % 
(1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_24(const uint32_t * __restrict in, 
uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + 
*out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_24(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker24(in,out); + + case 1: + return __fastunpack1_24(in,out); + + case 2: + return __fastunpack2_24(in,out); + + case 3: + return __fastunpack3_24(in,out); + + case 4: + return __fastunpack4_24(in,out); + + case 5: + return __fastunpack5_24(in,out); + + case 6: + return __fastunpack6_24(in,out); + + case 7: + return __fastunpack7_24(in,out); + + case 8: + return __fastunpack8_24(in,out); + + 
case 9: + return __fastunpack9_24(in,out); + + case 10: + return __fastunpack10_24(in,out); + + case 11: + return __fastunpack11_24(in,out); + + case 12: + return __fastunpack12_24(in,out); + + case 13: + return __fastunpack13_24(in,out); + + case 14: + return __fastunpack14_24(in,out); + + case 15: + return __fastunpack15_24(in,out); + + case 16: + return __fastunpack16_24(in,out); + + case 17: + return __fastunpack17_24(in,out); + + case 18: + return __fastunpack18_24(in,out); + + case 19: + return __fastunpack19_24(in,out); + + case 20: + return __fastunpack20_24(in,out); + + case 21: + return __fastunpack21_24(in,out); + + case 22: + return __fastunpack22_24(in,out); + + case 23: + return __fastunpack23_24(in,out); + + case 24: + return __fastunpack24_24(in,out); + + case 25: + return __fastunpack25_24(in,out); + + case 26: + return __fastunpack26_24(in,out); + + case 27: + return __fastunpack27_24(in,out); + + case 28: + return __fastunpack28_24(in,out); + + case 29: + return __fastunpack29_24(in,out); + + case 30: + return __fastunpack30_24(in,out); + + case 31: + return __fastunpack31_24(in,out); + + case 32: + return __fastunpack32_24(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_24(in,out); + + case 2: + return __fastpackwithoutmask2_24(in,out); + + case 3: + return __fastpackwithoutmask3_24(in,out); + + case 4: + return __fastpackwithoutmask4_24(in,out); + + case 5: + return __fastpackwithoutmask5_24(in,out); + + case 6: + return __fastpackwithoutmask6_24(in,out); + + case 7: + return __fastpackwithoutmask7_24(in,out); + + case 8: + return __fastpackwithoutmask8_24(in,out); + + case 9: + return 
__fastpackwithoutmask9_24(in,out); + + case 10: + return __fastpackwithoutmask10_24(in,out); + + case 11: + return __fastpackwithoutmask11_24(in,out); + + case 12: + return __fastpackwithoutmask12_24(in,out); + + case 13: + return __fastpackwithoutmask13_24(in,out); + + case 14: + return __fastpackwithoutmask14_24(in,out); + + case 15: + return __fastpackwithoutmask15_24(in,out); + + case 16: + return __fastpackwithoutmask16_24(in,out); + + case 17: + return __fastpackwithoutmask17_24(in,out); + + case 18: + return __fastpackwithoutmask18_24(in,out); + + case 19: + return __fastpackwithoutmask19_24(in,out); + + case 20: + return __fastpackwithoutmask20_24(in,out); + + case 21: + return __fastpackwithoutmask21_24(in,out); + + case 22: + return __fastpackwithoutmask22_24(in,out); + + case 23: + return __fastpackwithoutmask23_24(in,out); + + case 24: + return __fastpackwithoutmask24_24(in,out); + + case 25: + return __fastpackwithoutmask25_24(in,out); + + case 26: + return __fastpackwithoutmask26_24(in,out); + + case 27: + return __fastpackwithoutmask27_24(in,out); + + case 28: + return __fastpackwithoutmask28_24(in,out); + + case 29: + return __fastpackwithoutmask29_24(in,out); + + case 30: + return __fastpackwithoutmask30_24(in,out); + + case 31: + return __fastpackwithoutmask31_24(in,out); + + case 32: + return __fastpackwithoutmask32_24(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker32(const uint32_t * __restrict in, uint32_t * __restrict out) { + memset(out,0,32 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + 
++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask2_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out 
|= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask4_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + 
++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 
7 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask6_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 
; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask8_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) 
<< 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 9 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + ++in; + + return out; 
+ } + + + + uint32_t * __fastpackwithoutmask10_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 
11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 11 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 11 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask12_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 
; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; 
+ ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 13 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 13 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 13 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask14_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( 
(*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) 
) << 23 ; + ++out; + *out = ( (*in) ) >> ( 15 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 15 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 15 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask16_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; 
+ *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 17 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 17 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 17 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 17 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask18_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + 
++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 
19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 19 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 19 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 19 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 19 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask20_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out 
= ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + 
*out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 21 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 21 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 21 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 21 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 21 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask22_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) 
>> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + 
++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 23 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 23 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 23 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 23 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + 
*out = ( (*in) ) >> ( 23 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask24_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out 
|= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + 
*out = ( (*in) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask26_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + 
*out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 
- 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 27 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask28_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) 
>> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( 
(*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 29 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask30_32(const uint32_t * __restrict in, uint32_t * 
__restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( 
(*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask31_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( 
(*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 31 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + *out = ( (*in) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + *out = ( (*in) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask32_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = 
(*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + +#if 1 +#define DST(__x) out[__x] +#define DSI +#else +#define DST(__x) *out++ +#define DSI +#endif + +const uint32_t * __fastunpack1_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) & 1 ; + DSI; + DST( 1) = ( (*in) >> 1 ) & 1 ; + DSI; + DST( 2) = ( (*in) >> 2 ) & 1 ; + DSI; + DST( 3) = ( (*in) >> 3 ) & 1 ; + DSI; + DST( 4) = ( (*in) >> 4 ) & 1 ; + DSI; + DST( 5) = ( (*in) >> 5 ) & 1 ; + DSI; + DST( 6) = ( (*in) >> 6 ) & 1 ; + DSI; + DST( 7) = ( (*in) >> 7 ) & 1 ; + DSI; + DST( 8) = ( (*in) >> 8 ) & 1 ; + DSI; + DST( 9) = ( (*in) >> 9 ) & 1 ; + DSI; + DST(10) = ( (*in) >> 10 ) & 1 ; + DSI; + DST(11) = ( (*in) >> 11 ) & 1 ; + DSI; + DST(12) = ( (*in) >> 12 ) & 1 ; + DSI; + DST(13) = ( (*in) >> 13 ) & 1 ; + DSI; + DST(14) = ( (*in) >> 14 ) & 1 ; + DSI; + DST(15) = ( (*in) >> 15 ) & 1 ; + DSI; + DST(16) = ( (*in) >> 16 ) & 1 ; + DSI; + DST(17) = ( (*in) >> 17 ) & 1 ; + DSI; + DST(18) = ( (*in) >> 18 ) & 1 ; + DSI; + DST(19) = ( (*in) >> 19 ) & 1 ; + DSI; + DST(20) = ( (*in) >> 20 ) & 1 ; + DSI; + DST(21) = ( (*in) >> 21 ) & 1 ; + DSI; + DST(22) = ( (*in) >> 22 ) & 1 ; + DSI; + DST(23) = ( (*in) >> 23 ) & 1 ; + DSI; + DST(24) = ( (*in) >> 24 ) & 1 ; + DSI; + DST(25) = ( (*in) >> 25 ) & 1 ; + DSI; + DST(26) = ( (*in) >> 26 ) & 1 ; + DSI; + DST(27) = ( (*in) >> 27 ) & 1 ; + DSI; + DST(28) = ( (*in) >> 28 ) & 1 ; + DSI; + DST(29) = ( (*in) >> 29 ) & 1 ; + DSI; + DST(30) = ( (*in) >> 30 ) & 1 ; + DSI; + DST(31) = ( (*in) >> 31 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack2_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + 
DST( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST( 9) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(10) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(11) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(13) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(14) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(15) = ( (*in) >> 30 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST(17) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST(18) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + DST(19) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST(21) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST(22) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST(23) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST(25) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(26) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(27) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(29) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(30) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(31) = ( (*in) >> 30 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack3_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + DSI; + DST( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + DSI; + DST( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + DSI; + DST( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + DSI; + DST( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + DSI; + DST( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + DSI; + DST( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 3 ) ; + 
DSI; + DST( 9) = ( (*in) >> 27 ) % (1U << 3 ) ; + DSI; + DST(10) = ( (*in) >> 30 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + DSI; + DST(11) = ( (*in) >> 1 ) % (1U << 3 ) ; + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 3 ) ; + DSI; + DST(13) = ( (*in) >> 7 ) % (1U << 3 ) ; + DSI; + DST(14) = ( (*in) >> 10 ) % (1U << 3 ) ; + DSI; + DST(15) = ( (*in) >> 13 ) % (1U << 3 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 3 ) ; + DSI; + DST(17) = ( (*in) >> 19 ) % (1U << 3 ) ; + DSI; + DST(18) = ( (*in) >> 22 ) % (1U << 3 ) ; + DSI; + DST(19) = ( (*in) >> 25 ) % (1U << 3 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) % (1U << 3 ) ; + DSI; + DST(21) = ( (*in) >> 31 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + DSI; + DST(22) = ( (*in) >> 2 ) % (1U << 3 ) ; + DSI; + DST(23) = ( (*in) >> 5 ) % (1U << 3 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 3 ) ; + DSI; + DST(25) = ( (*in) >> 11 ) % (1U << 3 ) ; + DSI; + DST(26) = ( (*in) >> 14 ) % (1U << 3 ) ; + DSI; + DST(27) = ( (*in) >> 17 ) % (1U << 3 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 3 ) ; + DSI; + DST(29) = ( (*in) >> 23 ) % (1U << 3 ) ; + DSI; + DST(30) = ( (*in) >> 26 ) % (1U << 3 ) ; + DSI; + DST(31) = ( (*in) >> 29 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack4_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST( 7) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 9) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(10) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(11) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(13) = 
( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(14) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(15) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(17) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(18) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(19) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(21) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(22) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(23) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(25) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(26) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(27) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(29) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(30) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(31) = ( (*in) >> 28 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack5_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + DSI; + DST( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + DSI; + DST( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + DSI; + DST( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + DSI; + DST( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + DSI; + DST( 6) = ( (*in) >> 30 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + DSI; + DST( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 5 ) ; + DSI; + DST( 9) = ( (*in) >> 13 ) % (1U << 5 ) ; + DSI; + DST(10) = ( (*in) >> 18 ) % (1U << 5 ) ; + DSI; + DST(11) = ( (*in) >> 23 ) % (1U << 5 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + DSI; + DST(13) = ( (*in) >> 1 ) % (1U << 5 ) ; + DSI; + DST(14) = ( (*in) >> 6 ) % (1U << 5 ) ; + DSI; + DST(15) = ( (*in) >> 11 ) % (1U << 5 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 5 ) ; + DSI; + DST(17) = ( (*in) >> 21 ) % (1U 
<< 5 ) ; + DSI; + DST(18) = ( (*in) >> 26 ) % (1U << 5 ) ; + DSI; + DST(19) = ( (*in) >> 31 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 5 ) ; + DSI; + DST(21) = ( (*in) >> 9 ) % (1U << 5 ) ; + DSI; + DST(22) = ( (*in) >> 14 ) % (1U << 5 ) ; + DSI; + DST(23) = ( (*in) >> 19 ) % (1U << 5 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) % (1U << 5 ) ; + DSI; + DST(25) = ( (*in) >> 29 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); + DSI; + DST(26) = ( (*in) >> 2 ) % (1U << 5 ) ; + DSI; + DST(27) = ( (*in) >> 7 ) % (1U << 5 ) ; + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 5 ) ; + DSI; + DST(29) = ( (*in) >> 17 ) % (1U << 5 ) ; + DSI; + DST(30) = ( (*in) >> 22 ) % (1U << 5 ) ; + DSI; + DST(31) = ( (*in) >> 27 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack6_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + DSI; + DST( 5) = ( (*in) >> 30 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST( 9) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(10) = ( (*in) >> 28 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(11) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(13) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(14) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(15) = ( (*in) >> 26 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST(17) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST(18) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST(19) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST(20) = ( (*in) 
>> 24 ) % (1U << 6 ) ; + DSI; + DST(21) = ( (*in) >> 30 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST(22) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST(23) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST(25) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(26) = ( (*in) >> 28 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(27) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(29) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(30) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(31) = ( (*in) >> 26 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack7_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + DSI; + DST( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + DSI; + DST( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + DSI; + DST( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + DSI; + DST( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + DSI; + DST( 6) = ( (*in) >> 10 ) % (1U << 7 ) ; + DSI; + DST( 7) = ( (*in) >> 17 ) % (1U << 7 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 7 ) ; + DSI; + DST( 9) = ( (*in) >> 31 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + DSI; + DST(10) = ( (*in) >> 6 ) % (1U << 7 ) ; + DSI; + DST(11) = ( (*in) >> 13 ) % (1U << 7 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) % (1U << 7 ) ; + DSI; + DST(13) = ( (*in) >> 27 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + DSI; + DST(14) = ( (*in) >> 2 ) % (1U << 7 ) ; + DSI; + DST(15) = ( (*in) >> 9 ) % (1U << 7 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 7 ) ; + DSI; + DST(17) = ( (*in) >> 23 ) % (1U << 7 ) ; + DSI; + DST(18) = ( (*in) >> 30 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + DSI; + DST(19) = ( (*in) >> 5 ) % (1U << 7 ) ; + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 7 ) ; + DSI; + DST(21) = ( (*in) 
>> 19 ) % (1U << 7 ) ; + DSI; + DST(22) = ( (*in) >> 26 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + DSI; + DST(23) = ( (*in) >> 1 ) % (1U << 7 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 7 ) ; + DSI; + DST(25) = ( (*in) >> 15 ) % (1U << 7 ) ; + DSI; + DST(26) = ( (*in) >> 22 ) % (1U << 7 ) ; + DSI; + DST(27) = ( (*in) >> 29 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 7 ) ; + DSI; + DST(29) = ( (*in) >> 11 ) % (1U << 7 ) ; + DSI; + DST(30) = ( (*in) >> 18 ) % (1U << 7 ) ; + DSI; + DST(31) = ( (*in) >> 25 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack8_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 3) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 7) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 9) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(10) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(11) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(13) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(14) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(15) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(17) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(18) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(19) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(21) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(22) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(23) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(25) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(26) = ( 
(*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(27) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(29) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(30) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(31) = ( (*in) >> 24 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack9_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + DSI; + DST( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + DSI; + DST( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + DSI; + DST( 3) = ( (*in) >> 27 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + DSI; + DST( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + DSI; + DST( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + DSI; + DST( 7) = ( (*in) >> 31 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 9 ) ; + DSI; + DST( 9) = ( (*in) >> 17 ) % (1U << 9 ) ; + DSI; + DST(10) = ( (*in) >> 26 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + DSI; + DST(11) = ( (*in) >> 3 ) % (1U << 9 ) ; + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 9 ) ; + DSI; + DST(13) = ( (*in) >> 21 ) % (1U << 9 ) ; + DSI; + DST(14) = ( (*in) >> 30 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + DSI; + DST(15) = ( (*in) >> 7 ) % (1U << 9 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 9 ) ; + DSI; + DST(17) = ( (*in) >> 25 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + DSI; + DST(18) = ( (*in) >> 2 ) % (1U << 9 ) ; + DSI; + DST(19) = ( (*in) >> 11 ) % (1U << 9 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) % (1U << 9 ) ; + DSI; + DST(21) = ( (*in) >> 29 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + DSI; + DST(22) = ( (*in) >> 6 ) % (1U << 9 ) ; + DSI; + DST(23) = ( (*in) >> 15 ) % (1U << 9 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 1 ))<<( 9 - 1 ); + DSI; + DST(25) = ( (*in) >> 1 ) % (1U << 9 ) ; + DSI; + DST(26) = ( (*in) >> 10 ) % 
(1U << 9 ) ; + DSI; + DST(27) = ( (*in) >> 19 ) % (1U << 9 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 5 ))<<( 9 - 5 ); + DSI; + DST(29) = ( (*in) >> 5 ) % (1U << 9 ) ; + DSI; + DST(30) = ( (*in) >> 14 ) % (1U << 9 ) ; + DSI; + DST(31) = ( (*in) >> 23 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack10_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST( 3) = ( (*in) >> 30 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST( 6) = ( (*in) >> 28 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST( 9) = ( (*in) >> 26 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + DSI; + DST(10) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(11) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(13) = ( (*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(14) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(15) = ( (*in) >> 22 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST(17) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST(18) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST(19) = ( (*in) >> 30 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST(21) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST(22) = ( (*in) >> 28 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST(23) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST(25) = ( (*in) >> 26 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 4 
))<<( 10 - 4 ); + DSI; + DST(26) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(27) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(29) = ( (*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(30) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(31) = ( (*in) >> 22 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack11_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 11 ) ; + DSI; + DST( 1) = ( (*in) >> 11 ) % (1U << 11 ) ; + DSI; + DST( 2) = ( (*in) >> 22 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + DSI; + DST( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 11 ) ; + DSI; + DST( 5) = ( (*in) >> 23 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + DSI; + DST( 6) = ( (*in) >> 2 ) % (1U << 11 ) ; + DSI; + DST( 7) = ( (*in) >> 13 ) % (1U << 11 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + DSI; + DST( 9) = ( (*in) >> 3 ) % (1U << 11 ) ; + DSI; + DST(10) = ( (*in) >> 14 ) % (1U << 11 ) ; + DSI; + DST(11) = ( (*in) >> 25 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 11 ) ; + DSI; + DST(13) = ( (*in) >> 15 ) % (1U << 11 ) ; + DSI; + DST(14) = ( (*in) >> 26 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + DSI; + DST(15) = ( (*in) >> 5 ) % (1U << 11 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 11 ) ; + DSI; + DST(17) = ( (*in) >> 27 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + DSI; + DST(18) = ( (*in) >> 6 ) % (1U << 11 ) ; + DSI; + DST(19) = ( (*in) >> 17 ) % (1U << 11 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + DSI; + DST(21) = ( (*in) >> 7 ) % (1U << 11 ) ; + DSI; + DST(22) = ( (*in) >> 18 ) % (1U << 11 ) ; + DSI; + DST(23) = ( (*in) >> 29 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 
))<<( 11 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 11 ) ; + DSI; + DST(25) = ( (*in) >> 19 ) % (1U << 11 ) ; + DSI; + DST(26) = ( (*in) >> 30 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 9 ))<<( 11 - 9 ); + DSI; + DST(27) = ( (*in) >> 9 ) % (1U << 11 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 11 ) ; + DSI; + DST(29) = ( (*in) >> 31 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 10 ))<<( 11 - 10 ); + DSI; + DST(30) = ( (*in) >> 10 ) % (1U << 11 ) ; + DSI; + DST(31) = ( (*in) >> 21 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack12_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST( 2) = ( (*in) >> 24 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST( 5) = ( (*in) >> 28 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST( 7) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 9) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(10) = ( (*in) >> 24 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(11) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(13) = ( (*in) >> 28 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(14) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(15) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(17) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(18) = ( (*in) >> 24 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(19) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(21) = ( (*in) >> 28 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(22) = ( (*in) >> 8 ) 
% (1U << 12 ) ; + DSI; + DST(23) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(25) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(26) = ( (*in) >> 24 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(27) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(29) = ( (*in) >> 28 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(30) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(31) = ( (*in) >> 20 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack13_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + DSI; + DST( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + DSI; + DST( 2) = ( (*in) >> 26 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + DSI; + DST( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + DSI; + DST( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + DSI; + DST( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + DSI; + DST( 7) = ( (*in) >> 27 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 13 ) ; + DSI; + DST( 9) = ( (*in) >> 21 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + DSI; + DST(10) = ( (*in) >> 2 ) % (1U << 13 ) ; + DSI; + DST(11) = ( (*in) >> 15 ) % (1U << 13 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + DSI; + DST(13) = ( (*in) >> 9 ) % (1U << 13 ) ; + DSI; + DST(14) = ( (*in) >> 22 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + DSI; + DST(15) = ( (*in) >> 3 ) % (1U << 13 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 13 ) ; + DSI; + DST(17) = ( (*in) >> 29 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + DSI; + DST(18) = ( (*in) >> 10 ) % (1U << 13 ) ; + DSI; + DST(19) = ( (*in) >> 23 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 13 - 4 
); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 13 ) ; + DSI; + DST(21) = ( (*in) >> 17 ) % (1U << 13 ) ; + DSI; + DST(22) = ( (*in) >> 30 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + DSI; + DST(23) = ( (*in) >> 11 ) % (1U << 13 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 5 ))<<( 13 - 5 ); + DSI; + DST(25) = ( (*in) >> 5 ) % (1U << 13 ) ; + DSI; + DST(26) = ( (*in) >> 18 ) % (1U << 13 ) ; + DSI; + DST(27) = ( (*in) >> 31 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 12 ))<<( 13 - 12 ); + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 13 ) ; + DSI; + DST(29) = ( (*in) >> 25 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 6 ))<<( 13 - 6 ); + DSI; + DST(30) = ( (*in) >> 6 ) % (1U << 13 ) ; + DSI; + DST(31) = ( (*in) >> 19 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack14_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 14 ) ; + DSI; + DST( 1) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST( 2) = ( (*in) >> 28 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST( 3) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST( 5) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST( 6) = ( (*in) >> 20 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST( 7) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST( 9) = ( (*in) >> 30 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(10) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(11) = ( (*in) >> 26 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(13) = ( (*in) >> 22 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(14) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(15) = ( (*in) >> 18 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 14 ) ; + 
DSI; + DST(17) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST(18) = ( (*in) >> 28 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST(19) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST(20) = ( (*in) >> 24 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST(21) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST(22) = ( (*in) >> 20 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST(23) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST(25) = ( (*in) >> 30 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(26) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(27) = ( (*in) >> 26 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(29) = ( (*in) >> 22 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(30) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(31) = ( (*in) >> 18 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack15_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 15 ) ; + DSI; + DST( 1) = ( (*in) >> 15 ) % (1U << 15 ) ; + DSI; + DST( 2) = ( (*in) >> 30 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + DSI; + DST( 3) = ( (*in) >> 13 ) % (1U << 15 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + DSI; + DST( 5) = ( (*in) >> 11 ) % (1U << 15 ) ; + DSI; + DST( 6) = ( (*in) >> 26 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + DSI; + DST( 7) = ( (*in) >> 9 ) % (1U << 15 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + DSI; + DST( 9) = ( (*in) >> 7 ) % (1U << 15 ) ; + DSI; + DST(10) = ( (*in) >> 22 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + DSI; + DST(11) = ( (*in) >> 5 ) % (1U << 15 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) ; + ++in; + DST(12) |= ((*in) 
% (1U<< 3 ))<<( 15 - 3 ); + DSI; + DST(13) = ( (*in) >> 3 ) % (1U << 15 ) ; + DSI; + DST(14) = ( (*in) >> 18 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + DSI; + DST(15) = ( (*in) >> 1 ) % (1U << 15 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 15 ) ; + DSI; + DST(17) = ( (*in) >> 31 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + DSI; + DST(18) = ( (*in) >> 14 ) % (1U << 15 ) ; + DSI; + DST(19) = ( (*in) >> 29 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 15 ) ; + DSI; + DST(21) = ( (*in) >> 27 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + DSI; + DST(22) = ( (*in) >> 10 ) % (1U << 15 ) ; + DSI; + DST(23) = ( (*in) >> 25 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 15 ) ; + DSI; + DST(25) = ( (*in) >> 23 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); + DSI; + DST(26) = ( (*in) >> 6 ) % (1U << 15 ) ; + DSI; + DST(27) = ( (*in) >> 21 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 15 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 15 ) ; + DSI; + DST(29) = ( (*in) >> 19 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 2 ))<<( 15 - 2 ); + DSI; + DST(30) = ( (*in) >> 2 ) % (1U << 15 ) ; + DSI; + DST(31) = ( (*in) >> 17 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack16_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 1) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 2) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 3) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 5) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 6) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 7) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 9) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(10) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(11) = ( (*in) >> 16 ) ; + 
++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(13) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(14) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(15) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(17) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(18) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(19) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(21) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(22) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(23) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(25) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(26) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(27) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(29) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(30) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(31) = ( (*in) >> 16 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack17_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 17 ) ; + DSI; + DST( 1) = ( (*in) >> 17 ) ; + ++in; + DST( 1) |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + DSI; + DST( 2) = ( (*in) >> 2 ) % (1U << 17 ) ; + DSI; + DST( 3) = ( (*in) >> 19 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 17 ) ; + DSI; + DST( 5) = ( (*in) >> 21 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + DSI; + DST( 6) = ( (*in) >> 6 ) % (1U << 17 ) ; + DSI; + DST( 7) = ( (*in) >> 23 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 17 ) ; + DSI; + DST( 9) = ( (*in) >> 25 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + DSI; + DST(10) = ( (*in) >> 10 ) % (1U << 17 ) ; + DSI; + DST(11) = ( (*in) >> 27 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 17 ) ; + DSI; + 
DST(13) = ( (*in) >> 29 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + DSI; + DST(14) = ( (*in) >> 14 ) % (1U << 17 ) ; + DSI; + DST(15) = ( (*in) >> 31 ) ; + ++in; + DST(15) |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + DSI; + DST(16) = ( (*in) >> 16 ) ; + ++in; + DST(16) |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + DSI; + DST(17) = ( (*in) >> 1 ) % (1U << 17 ) ; + DSI; + DST(18) = ( (*in) >> 18 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); + DSI; + DST(19) = ( (*in) >> 3 ) % (1U << 17 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + DSI; + DST(21) = ( (*in) >> 5 ) % (1U << 17 ) ; + DSI; + DST(22) = ( (*in) >> 22 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + DSI; + DST(23) = ( (*in) >> 7 ) % (1U << 17 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); + DSI; + DST(25) = ( (*in) >> 9 ) % (1U << 17 ) ; + DSI; + DST(26) = ( (*in) >> 26 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); + DSI; + DST(27) = ( (*in) >> 11 ) % (1U << 17 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); + DSI; + DST(29) = ( (*in) >> 13 ) % (1U << 17 ) ; + DSI; + DST(30) = ( (*in) >> 30 ) ; + ++in; + DST(30) |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); + DSI; + DST(31) = ( (*in) >> 15 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack18_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= 
((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack19_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out 
= ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % 
(1U<< 13 ))<<( 19 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack20_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U 
<< 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out 
= ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack22_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 
20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack23_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( 
(*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack24_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + 
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 
16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 
6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack26_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 
26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack27_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 
2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); + out++; + *out = ( (*in) >> 10 ) 
; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack28_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; 
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; 
+ *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack30_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % 
(1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) 
>> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; 
+ ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); + out++; + *out = ( (*in) >> 1 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack32_32(const uint32_t * __restrict in, uint32_t * __restrict out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + 
out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker32(in,out); + + case 1: + return __fastunpack1_32(in,out); + + case 2: + return __fastunpack2_32(in,out); + + case 3: + return __fastunpack3_32(in,out); + + case 4: + return __fastunpack4_32(in,out); + + case 5: + return __fastunpack5_32(in,out); + + case 6: + return __fastunpack6_32(in,out); + + case 7: + return __fastunpack7_32(in,out); + + case 8: + return __fastunpack8_32(in,out); + + case 9: + return __fastunpack9_32(in,out); + + case 10: + return __fastunpack10_32(in,out); + + case 11: + return __fastunpack11_32(in,out); + + case 12: + return __fastunpack12_32(in,out); + + case 13: + return __fastunpack13_32(in,out); + + case 14: + return __fastunpack14_32(in,out); + + case 15: + return __fastunpack15_32(in,out); + + case 16: + return __fastunpack16_32(in,out); + + case 17: + return __fastunpack17_32(in,out); + + case 18: + return __fastunpack18_32(in,out); + + case 19: + return __fastunpack19_32(in,out); + + case 20: + return __fastunpack20_32(in,out); + + case 21: + return __fastunpack21_32(in,out); + + case 22: + return __fastunpack22_32(in,out); + + case 23: + return __fastunpack23_32(in,out); + + case 24: + return 
__fastunpack24_32(in,out); + + case 25: + return __fastunpack25_32(in,out); + + case 26: + return __fastunpack26_32(in,out); + + case 27: + return __fastunpack27_32(in,out); + + case 28: + return __fastunpack28_32(in,out); + + case 29: + return __fastunpack29_32(in,out); + + case 30: + return __fastunpack30_32(in,out); + + case 31: + return __fastunpack31_32(in,out); + + case 32: + return __fastunpack32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_32(in,out); + + case 2: + return __fastpackwithoutmask2_32(in,out); + + case 3: + return __fastpackwithoutmask3_32(in,out); + + case 4: + return __fastpackwithoutmask4_32(in,out); + + case 5: + return __fastpackwithoutmask5_32(in,out); + + case 6: + return __fastpackwithoutmask6_32(in,out); + + case 7: + return __fastpackwithoutmask7_32(in,out); + + case 8: + return __fastpackwithoutmask8_32(in,out); + + case 9: + return __fastpackwithoutmask9_32(in,out); + + case 10: + return __fastpackwithoutmask10_32(in,out); + + case 11: + return __fastpackwithoutmask11_32(in,out); + + case 12: + return __fastpackwithoutmask12_32(in,out); + + case 13: + return __fastpackwithoutmask13_32(in,out); + + case 14: + return __fastpackwithoutmask14_32(in,out); + + case 15: + return __fastpackwithoutmask15_32(in,out); + + case 16: + return __fastpackwithoutmask16_32(in,out); + + case 17: + return __fastpackwithoutmask17_32(in,out); + + case 18: + return __fastpackwithoutmask18_32(in,out); + + case 19: + return __fastpackwithoutmask19_32(in,out); + + case 20: + return __fastpackwithoutmask20_32(in,out); + + case 21: + return __fastpackwithoutmask21_32(in,out); + + case 22: + return 
__fastpackwithoutmask22_32(in,out); + + case 23: + return __fastpackwithoutmask23_32(in,out); + + case 24: + return __fastpackwithoutmask24_32(in,out); + + case 25: + return __fastpackwithoutmask25_32(in,out); + + case 26: + return __fastpackwithoutmask26_32(in,out); + + case 27: + return __fastpackwithoutmask27_32(in,out); + + case 28: + return __fastpackwithoutmask28_32(in,out); + + case 29: + return __fastpackwithoutmask29_32(in,out); + + case 30: + return __fastpackwithoutmask30_32(in,out); + + case 31: + return __fastpackwithoutmask31_32(in,out); + + case 32: + return __fastpackwithoutmask32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + diff --git a/ext/simdcomp/bitpacka.h b/ext/simdcomp/bitpacka.h new file mode 100644 index 0000000..6fa76c8 --- /dev/null +++ b/ext/simdcomp/bitpacka.h @@ -0,0 +1,28 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef BITPACKINGALIGNED +#define BITPACKINGALIGNED +#include +#include +#include + +const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); +uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); + +const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); +uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); + +const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); +uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); + +const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit); + +uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * 
__restrict out, const uint32_t bit); + + + +#endif // BITPACKINGALIGNED diff --git a/ext/simdcomp/example.c b/ext/simdcomp/example.c new file mode 100644 index 0000000..0394e20 --- /dev/null +++ b/ext/simdcomp/example.c @@ -0,0 +1,66 @@ +#include +#include +#include "simdcomp.h" + + +// compresses data from datain to buffer, returns how many bytes written +size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + uint32_t offset = 0; + uint8_t * initout = buffer; + for(size_t k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t b = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + *buffer++ = b; + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, + b); + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + buffer += b * sizeof(__m128i); + } + return buffer - initout; +} + + +int main() { + int REPEAT = 5; + int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128 + uint32_t * datain = malloc(N * sizeof(uint32_t)); + size_t compsize; + clock_t start, end; + + uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (int gap = 1; gap <= 243; gap *= 3) { + printf("\n"); + printf(" gap = %u \n", gap); + for (int k = 0; k < N; ++k) + datain[k] = k * gap; + uint32_t offset = 0; + compsize = compress(datain,N,buffer); + printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); + start = clock(); + uint32_t bogus = 0; + for(int repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (int k = 0; k * SIMDBlockSize < N; ++k) { + uint8_t b = *decbuffer++; + simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); + // do something here with backbuffer + bogus += backbuffer[3]; + decbuffer += b * sizeof(__m128i); + offset = backbuffer[SIMDBlockSize - 1]; + } 
+ } + end = clock(); + double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); + printf("ignore me %i \n",bogus); + } + free(buffer); + free(datain); + free(backbuffer); + return 0; +} + diff --git a/ext/simdcomp/include/simdbitpacking.h b/ext/simdcomp/include/simdbitpacking.h new file mode 100644 index 0000000..301f4f5 --- /dev/null +++ b/ext/simdcomp/include/simdbitpacking.h @@ -0,0 +1,21 @@ +/** + * This code is released under a BSD License. + */ +#ifndef SIMDBITPACKING_H_ +#define SIMDBITPACKING_H_ + +#include // SSE2 is required +#include // use a C99-compliant compiler, please +#include // for memset + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +void simdpack(const uint32_t * in,__m128i * out, uint32_t bit); + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +void simdpackwithoutmask(const uint32_t * in,__m128i * out, uint32_t bit); + +//reads "bit" 128-bit vectors from "in", writes 128 values to "out" +void simdunpack(const __m128i * in,uint32_t * out, uint32_t bit); + + +#endif /* SIMDBITPACKING_H_ */ diff --git a/ext/simdcomp/include/simdcomp.h b/ext/simdcomp/include/simdcomp.h new file mode 100644 index 0000000..8875f0f --- /dev/null +++ b/ext/simdcomp/include/simdcomp.h @@ -0,0 +1,12 @@ +/** + * This code is released under a BSD License. + */ + +#ifndef SIMDCOMP_H_ +#define SIMDCOMP_H_ + +#include "simdbitpacking.h" +#include "simdcomputil.h" +#include "simdintegratedbitpacking.h" + +#endif diff --git a/ext/simdcomp/include/simdcomputil.h b/ext/simdcomp/include/simdcomputil.h new file mode 100644 index 0000000..107665b --- /dev/null +++ b/ext/simdcomp/include/simdcomputil.h @@ -0,0 +1,29 @@ +/** + * This code is released under a BSD License. 
+ */ + +#ifndef SIMDCOMPUTIL_H_ +#define SIMDCOMPUTIL_H_ + +#include // SSE2 is required +#include // use a C99-compliant compiler, please + + + + +// returns the integer logarithm of v (bit width) +uint32_t bits(const uint32_t v); + +// max integer logarithm over a range of SIMDBlockSize integers (128 integer) +uint32_t maxbits(const uint32_t * begin); + +enum{ SIMDBlockSize = 128}; + +// like maxbit over 128 integers (SIMDBlockSize) with provided initial value +// and using differential coding +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in); + + + + +#endif /* SIMDCOMPUTIL_H_ */ diff --git a/ext/simdcomp/include/simdintegratedbitpacking.h b/ext/simdcomp/include/simdintegratedbitpacking.h new file mode 100644 index 0000000..18ca795 --- /dev/null +++ b/ext/simdcomp/include/simdintegratedbitpacking.h @@ -0,0 +1,27 @@ +/** + * This code is released under a BSD License. + */ + +#ifndef SIMD_INTEGRATED_BITPACKING_H +#define SIMD_INTEGRATED_BITPACKING_H + +#include // SSE2 is required +#include // use a C99-compliant compiler, please + +#include "simdcomputil.h" + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +// integer values should be in sorted order (for best results) +void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); + + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +// integer values should be in sorted order (for best results) +void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); + + +//reads "bit" 128-bit vectors from "in", writes 128 values to "out" +void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, uint32_t bit); + + +#endif diff --git a/ext/simdcomp/makefile b/ext/simdcomp/makefile new file mode 100644 index 0000000..6ebd9d9 --- /dev/null +++ b/ext/simdcomp/makefile @@ -0,0 +1,54 @@ +# minimalist makefile +.SUFFIXES: +# +.SUFFIXES: .cpp .o .c .h + +CFLAGS = -fPIC -std=c99 -O3 -Wall -Wextra 
-Wno-unused-parameter -pedantic +LDFLAGS = -shared +LIBNAME=libsimdcomp.so.0.0.3 +all: unit $(LIBNAME) +test: + ./unit +install: $(OBJECTS) + cp $(LIBNAME) /usr/local/lib + ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so + ldconfig + cp $(HEADERS) /usr/local/include + + + +HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h + +uninstall: + for h in $(HEADERS) ; do rm /usr/local/$$h; done + rm /usr/local/lib/$(LIBNAME) + rm /usr/local/lib/libsimdcomp.so + ldconfig + + +OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o + +$(LIBNAME): $(OBJECTS) + $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) + + + +simdcomputil.o: ./src/simdcomputil.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude + +simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude + +simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude + +example: ./example.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) + +unit: ./src/unit.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS) +dynunit: ./src/unit.c $(HEADERS) $(LIBNAME) + $(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp + +clean: + rm -f unit *.o $(LIBNAME) diff --git a/ext/simdcomp/src/simdbitpacking.c b/ext/simdcomp/src/simdbitpacking.c new file mode 100644 index 0000000..7137682 --- /dev/null +++ b/ext/simdcomp/src/simdbitpacking.c @@ -0,0 +1,14009 @@ +/** + * This code is released under a BSD License. 
+ */ +#include "../include/simdbitpacking.h" + + +static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { + (void) _in; + memset(out,0,32 * 4 * 4); +} + +static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, 
__m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 7 consecutive 128-bit output words, 7 bits per value, with each
 * 32-bit lane packed as an independent bit stream (epi32 shifts/ORs operate
 * per lane).  "withoutmask" = inputs are NOT masked: the caller must ensure
 * every value fits in 7 bits or stray high bits corrupt adjacent fields.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 7;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(7 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 9 consecutive 128-bit output words, 9 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 9 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 9;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(9 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 10 consecutive 128-bit output words, 10 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 10 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 10;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(10 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 11 consecutive 128-bit output words, 11 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 11 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask11_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 11;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(11 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 12 consecutive 128-bit output words, 12 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 12 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 12;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(12 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 13 consecutive 128-bit output words, 13 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 13 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 13;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(13 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 14 consecutive 128-bit output words, 14 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 14 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 14;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(14 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 15 consecutive 128-bit output words, 15 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  "withoutmask" = inputs
 * are NOT masked: the caller must ensure every value fits in 15 bits.
 * Unaligned loads/stores are used, so neither buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 15;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(15 - pos)));
        }
    }
}
/* Bit-packs 128 32-bit integers (32 SSE vectors of 4 lanes each, read from
 * _in) into 17 consecutive 128-bit output words, 17 bits per value, with each
 * 32-bit lane packed as an independent bit stream.  Since 17 > 16, most values
 * straddle an output-word boundary and their high bits are carried over with a
 * right shift.  "withoutmask" = inputs are NOT masked: the caller must ensure
 * every value fits in 17 bits.  Unaligned loads/stores are used, so neither
 * buffer needs 16-byte alignment. */
static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) {
    const __m128i *src = (const __m128i *)_in;
    __m128i acc = _mm_setzero_si128();
    unsigned pos = 0;   /* bit offset inside the current output word (per lane) */
    unsigned k;

    for (k = 0; k < 32; k++) {
        const __m128i v = _mm_loadu_si128(src + k);
        if (pos == 0)
            acc = v;    /* first field of a fresh output word */
        else
            acc = _mm_or_si128(acc, _mm_sll_epi32(v, _mm_cvtsi32_si128((int)pos)));
        pos += 17;
        if (pos >= 32) {                    /* lane is full: flush it */
            _mm_storeu_si128(out++, acc);
            pos -= 32;
            if (pos)                        /* carry the bits that spilled over */
                acc = _mm_srl_epi32(v, _mm_cvtsi32_si128((int)(17 - pos)));
        }
    }
}
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = 
InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg 
= InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + 
OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + 
OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_loadu_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void 
__SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + 
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + 
_mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg 
= _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + 
++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); 
+ __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<26)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + __m128i InReg = 
_mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = 
(const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 
20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<29)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); 
+ ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg 
= _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static 
void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { + 
const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + unsigned i; + for (i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} + + + + +static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); 
+ + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void 
__SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask 
= _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); 
+ _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = 
_mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + 
+ OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + 
InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = 
_mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , 
mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i 
mask = _mm_set1_epi32((1U<<22)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + 
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) 
; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg 
, mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_storeu_si128(out++, OutReg); + + +} + + 
+ + +static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), 
mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_storeu_si128(out++, OutReg); + 
+ +} + + + + +static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_srli_epi32(InReg,2) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + 
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_storeu_si128(out++, OutReg); + + +} + + +void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + uint32_t outer; + for(outer=0; outer< 32 ;++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } +} + + + +void simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { + switch(bit) { + case 0: SIMD_nullunpacker32(in,out); return; + + case 1: __SIMD_fastunpack1_32(in,out); return; + + case 2: __SIMD_fastunpack2_32(in,out); return; + + case 3: __SIMD_fastunpack3_32(in,out); return; + + case 4: __SIMD_fastunpack4_32(in,out); return; + + case 5: __SIMD_fastunpack5_32(in,out); return; + + case 6: __SIMD_fastunpack6_32(in,out); return; + + case 7: __SIMD_fastunpack7_32(in,out); return; + + case 8: __SIMD_fastunpack8_32(in,out); return; + + case 9: __SIMD_fastunpack9_32(in,out); return; + + case 10: __SIMD_fastunpack10_32(in,out); return; + + case 11: __SIMD_fastunpack11_32(in,out); return; + + case 12: __SIMD_fastunpack12_32(in,out); return; + + case 13: __SIMD_fastunpack13_32(in,out); return; + + case 14: __SIMD_fastunpack14_32(in,out); return; + + case 15: __SIMD_fastunpack15_32(in,out); return; + + case 16: __SIMD_fastunpack16_32(in,out); return; + + case 17: __SIMD_fastunpack17_32(in,out); return; + + case 18: __SIMD_fastunpack18_32(in,out); return; + + case 19: __SIMD_fastunpack19_32(in,out); return; + + case 20: __SIMD_fastunpack20_32(in,out); return; + + case 21: __SIMD_fastunpack21_32(in,out); return; + + case 22: __SIMD_fastunpack22_32(in,out); return; + + case 23: __SIMD_fastunpack23_32(in,out); return; + + case 24: __SIMD_fastunpack24_32(in,out); return; + + case 25: __SIMD_fastunpack25_32(in,out); return; + + case 26: 
__SIMD_fastunpack26_32(in,out); return; + + case 27: __SIMD_fastunpack27_32(in,out); return; + + case 28: __SIMD_fastunpack28_32(in,out); return; + + case 29: __SIMD_fastunpack29_32(in,out); return; + + case 30: __SIMD_fastunpack30_32(in,out); return; + + case 31: __SIMD_fastunpack31_32(in,out); return; + + case 32: __SIMD_fastunpack32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; + + case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; + + case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; + + case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; + + case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; + + case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; + + case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; + + case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; + + case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; + + case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; + + case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; + + case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; + + case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; + + case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; + + case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; + + case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; + + case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; + + case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; + + case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; + + case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; + + case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; + + case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; + + case 23: __SIMD_fastpackwithoutmask23_32(in,out); return; + + case 24: 
/* Pack 128 32-bit integers from `in` into `bit`-bit fields at `out`.
 * Assumes every value fits in the prescribed number of bits.
 * Dispatches to the generated kernel for the requested width; a width
 * of 0 stores nothing, and widths outside 0..32 are silently ignored
 * (same contract as before). */
void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) {
/* One case per bit width; token pasting selects the generated kernel. */
#define SIMDPACK_DISPATCH(B) case B: __SIMD_fastpack##B##_32(in, out); return;
    switch (bit) {
    case 0: return; /* zero-width data occupies no space */
    SIMDPACK_DISPATCH(1)  SIMDPACK_DISPATCH(2)  SIMDPACK_DISPATCH(3)
    SIMDPACK_DISPATCH(4)  SIMDPACK_DISPATCH(5)  SIMDPACK_DISPATCH(6)
    SIMDPACK_DISPATCH(7)  SIMDPACK_DISPATCH(8)  SIMDPACK_DISPATCH(9)
    SIMDPACK_DISPATCH(10) SIMDPACK_DISPATCH(11) SIMDPACK_DISPATCH(12)
    SIMDPACK_DISPATCH(13) SIMDPACK_DISPATCH(14) SIMDPACK_DISPATCH(15)
    SIMDPACK_DISPATCH(16) SIMDPACK_DISPATCH(17) SIMDPACK_DISPATCH(18)
    SIMDPACK_DISPATCH(19) SIMDPACK_DISPATCH(20) SIMDPACK_DISPATCH(21)
    SIMDPACK_DISPATCH(22) SIMDPACK_DISPATCH(23) SIMDPACK_DISPATCH(24)
    SIMDPACK_DISPATCH(25) SIMDPACK_DISPATCH(26) SIMDPACK_DISPATCH(27)
    SIMDPACK_DISPATCH(28) SIMDPACK_DISPATCH(29) SIMDPACK_DISPATCH(30)
    SIMDPACK_DISPATCH(31) SIMDPACK_DISPATCH(32)
    default: break; /* invalid width: deliberately a no-op, as before */
    }
#undef SIMDPACK_DISPATCH
}
0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft +#endif +} + +__attribute__ ((pure)) +uint32_t maxbits(const uint32_t * begin) { + uint32_t accumulator = 0;const uint32_t * k; + for (k = begin; k != begin + SIMDBlockSize; ++k) { + accumulator |= *k; + } + return bits(accumulator); +} + +static uint32_t maxbitas32int(const __m128i accumulator) { + uint32_t tmparray[4]; + _mm_storeu_si128((__m128i *) (tmparray), accumulator); + return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); +} + + +// maxbit over 128 integers (SIMDBlockSize) with provided initial value +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { + __m128i initoffset = _mm_set1_epi32 (initvalue); + const __m128i* pin = (const __m128i*)(in); + __m128i newvec = _mm_loadu_si128(pin); + __m128i accumulator = Delta(newvec , initoffset); + __m128i oldvec = newvec; + uint32_t k; + for(k = 1; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); +} + diff --git a/ext/simdcomp/src/simdintegratedbitpacking.c b/ext/simdcomp/src/simdintegratedbitpacking.c new file mode 100644 index 0000000..951bb85 --- /dev/null +++ b/ext/simdcomp/src/simdintegratedbitpacking.c @@ -0,0 +1,24872 @@ +/** + * This code is released under a BSD License. 
+ */ +#include "../include/simdintegratedbitpacking.h" + +__attribute__((always_inline)) +static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +} + +__attribute__((always_inline)) +static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); + return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); +} + + +__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + (void) _in; + __m128i *out = (__m128i*)(_out); + const __m128i zero = _mm_set1_epi32 (0); + unsigned i; + for (i = 0; i < 8; ++i) { + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + } + + return initOffset; +} + + + + +void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) { + (void) initOffset; + (void) _in; + (void) out; +} + + +void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { + (void) initOffset; + (void) _in; + (void) out; +} + + + +void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(3U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn 
= _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg 
= Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(7U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; 
+ CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask4(__m128i initOffset, const 
uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(15U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(31U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(63U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(127U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i 
*in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + 
++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(255U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(511U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); 
+ initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1023U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2047U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + 
+void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4095U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8191U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 
- 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); 
+ _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16383U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(32767U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(65535U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(131071U); ; + + __m128i CurrIn = 
_mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, 
OutReg); + + +} + + + + +void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(262143U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(524287U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1048575U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 
- 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2097151U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg 
= _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4194303U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { 
+ const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 
- 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8388607U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const 
__m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16777215U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; 
+ OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(33554431U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; 
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(67108863U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(134217727U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
_mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(268435455U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(536870911U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); 
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + 
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + 
++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1073741823U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2147483647U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + (void) initOffset; + __m128i OutReg; + + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + 
OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, 
OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + (void) initOffset; + __m128i OutReg; + + + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + 
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; 
+ ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + + +__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp 
= _mm_srli_epi32(InReg,31); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack3(__m128i initOffset, const 
__m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<3)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + 
__m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + 
+ tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + 
__m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return 
initOffset; + +} + + + + + +__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); 
+ OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); 
+ OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); 
+ OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = 
tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * 
_out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
+ _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + 
OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
17-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); 
+ initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + 
OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg 
= PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg 
= _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + 
OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); 
+ initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; 
+ ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); 
+ + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp 
= _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + 
++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + 
tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset 
= OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + 
+ OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = 
tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +__m128i iunpack32(__m128i initOffset, const __m128i* in, uint32_t * _out) { + (void) initOffset; + __m128i * mout = (__m128i *)(_out); + __m128i invec; + size_t k; + for(k = 0; k < 128/4; ++k) { + invec = 
_mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} + + + + + void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: iunpack0(initOffset,in,out); break; + + case 1: iunpack1(initOffset,in,out); break; + + case 2: iunpack2(initOffset,in,out); break; + + case 3: iunpack3(initOffset,in,out); break; + + case 4: iunpack4(initOffset,in,out); break; + + case 5: iunpack5(initOffset,in,out); break; + + case 6: iunpack6(initOffset,in,out); break; + + case 7: iunpack7(initOffset,in,out); break; + + case 8: iunpack8(initOffset,in,out); break; + + case 9: iunpack9(initOffset,in,out); break; + + case 10: iunpack10(initOffset,in,out); break; + + case 11: iunpack11(initOffset,in,out); break; + + case 12: iunpack12(initOffset,in,out); break; + + case 13: iunpack13(initOffset,in,out); break; + + case 14: iunpack14(initOffset,in,out); break; + + case 15: iunpack15(initOffset,in,out); break; + + case 16: iunpack16(initOffset,in,out); break; + + case 17: iunpack17(initOffset,in,out); break; + + case 18: iunpack18(initOffset,in,out); break; + + case 19: iunpack19(initOffset,in,out); break; + + case 20: iunpack20(initOffset,in,out); break; + + case 21: iunpack21(initOffset,in,out); break; + + case 22: iunpack22(initOffset,in,out); break; + + case 23: iunpack23(initOffset,in,out); break; + + case 24: iunpack24(initOffset,in,out); break; + + case 25: iunpack25(initOffset,in,out); break; + + case 26: iunpack26(initOffset,in,out); break; + + case 27: iunpack27(initOffset,in,out); break; + + case 28: iunpack28(initOffset,in,out); break; + + case 29: iunpack29(initOffset,in,out); break; + + case 30: iunpack30(initOffset,in,out); break; + + case 31: iunpack31(initOffset,in,out); break; + + case 32: iunpack32(initOffset,in,out); break; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + +void 
simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: break; + + case 1: ipackwithoutmask1(initOffset,in,out); break; + + case 2: ipackwithoutmask2(initOffset,in,out); break; + + case 3: ipackwithoutmask3(initOffset,in,out); break; + + case 4: ipackwithoutmask4(initOffset,in,out); break; + + case 5: ipackwithoutmask5(initOffset,in,out); break; + + case 6: ipackwithoutmask6(initOffset,in,out); break; + + case 7: ipackwithoutmask7(initOffset,in,out); break; + + case 8: ipackwithoutmask8(initOffset,in,out); break; + + case 9: ipackwithoutmask9(initOffset,in,out); break; + + case 10: ipackwithoutmask10(initOffset,in,out); break; + + case 11: ipackwithoutmask11(initOffset,in,out); break; + + case 12: ipackwithoutmask12(initOffset,in,out); break; + + case 13: ipackwithoutmask13(initOffset,in,out); break; + + case 14: ipackwithoutmask14(initOffset,in,out); break; + + case 15: ipackwithoutmask15(initOffset,in,out); break; + + case 16: ipackwithoutmask16(initOffset,in,out); break; + + case 17: ipackwithoutmask17(initOffset,in,out); break; + + case 18: ipackwithoutmask18(initOffset,in,out); break; + + case 19: ipackwithoutmask19(initOffset,in,out); break; + + case 20: ipackwithoutmask20(initOffset,in,out); break; + + case 21: ipackwithoutmask21(initOffset,in,out); break; + + case 22: ipackwithoutmask22(initOffset,in,out); break; + + case 23: ipackwithoutmask23(initOffset,in,out); break; + + case 24: ipackwithoutmask24(initOffset,in,out); break; + + case 25: ipackwithoutmask25(initOffset,in,out); break; + + case 26: ipackwithoutmask26(initOffset,in,out); break; + + case 27: ipackwithoutmask27(initOffset,in,out); break; + + case 28: ipackwithoutmask28(initOffset,in,out); break; + + case 29: ipackwithoutmask29(initOffset,in,out); break; + + case 30: ipackwithoutmask30(initOffset,in,out); break; + + case 31: ipackwithoutmask31(initOffset,in,out); break; + + 
case 32: ipackwithoutmask32(initOffset,in,out); break; + + default: break; + } +} + + + + +void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: break;; + + case 1: ipack1(initOffset, in,out); break; + + case 2: ipack2(initOffset, in,out); break; + + case 3: ipack3(initOffset, in,out); break; + + case 4: ipack4(initOffset, in,out); break; + + case 5: ipack5(initOffset, in,out); break; + + case 6: ipack6(initOffset, in,out); break; + + case 7: ipack7(initOffset, in,out); break; + + case 8: ipack8(initOffset, in,out); break; + + case 9: ipack9(initOffset, in,out); break; + + case 10: ipack10(initOffset, in,out); break; + + case 11: ipack11(initOffset, in,out); break; + + case 12: ipack12(initOffset, in,out); break; + + case 13: ipack13(initOffset, in,out); break; + + case 14: ipack14(initOffset, in,out); break; + + case 15: ipack15(initOffset, in,out); break; + + case 16: ipack16(initOffset, in,out); break; + + case 17: ipack17(initOffset, in,out); break; + + case 18: ipack18(initOffset, in,out); break; + + case 19: ipack19(initOffset, in,out); break; + + case 20: ipack20(initOffset, in,out); break; + + case 21: ipack21(initOffset, in,out); break; + + case 22: ipack22(initOffset, in,out); break; + + case 23: ipack23(initOffset, in,out); break; + + case 24: ipack24(initOffset, in,out); break; + + case 25: ipack25(initOffset, in,out); break; + + case 26: ipack26(initOffset, in,out); break; + + case 27: ipack27(initOffset, in,out); break; + + case 28: ipack28(initOffset, in,out); break; + + case 29: ipack29(initOffset, in,out); break; + + case 30: ipack30(initOffset, in,out); break; + + case 31: ipack31(initOffset, in,out); break; + + case 32: ipack32(initOffset, in,out); break; + + default: break; + } +} + diff --git a/ext/simdcomp/src/unit.c b/ext/simdcomp/src/unit.c new file mode 100644 index 0000000..826f447 --- /dev/null +++ b/ext/simdcomp/src/unit.c 
@@ -0,0 +1,63 @@ +/** + * This code is released under a BSD License. + */ +#include +#include +#include "simdcomp.h" + + +int main() { + int N = 5000 * SIMDBlockSize; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (int gap = 1; gap <= 387420489; gap *= 3) { + printf(" gap = %u \n", gap); + for (int k = 0; k < N; ++k) + datain[k] = k * gap; + uint32_t offset = 0; + for (int k = 0; k * SIMDBlockSize < N; ++k) { + ///////////////////////////// + // First part works for general arrays (sorted or unsorted) + ///////////////////////////// + // we compute the bit width + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b 128-bit vectors at "buffer" + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpack(buffer, backbuffer, b);//uncompressed + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack\n"); + return -2; + } + } + ///////////////////////////// + // next part assumes that the data is sorted (uses differential coding) + ///////////////////////////// + // we compute the bit width + const uint32_t b1 = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b1 128-bit vectors at "buffer" + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, + b1); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpackd1(offset, buffer, backbuffer, b1); + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack d1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + + } + 
} + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} diff --git a/ext/simple8b.c b/ext/simple8b.c new file mode 100644 index 0000000..3ac5615 --- /dev/null +++ b/ext/simple8b.c @@ -0,0 +1,330 @@ +// modified and optimized (speed + compression) by powturbo +// 64 bits version from: Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. +// Softw., Pract. Exper. 40(2): 131-147 (2010) +// http://ww2.cs.mu.oz.au/~alistair/coders-64bit/ + + #if defined(__x86_64__) || defined(__x86_32__) +static inline int bsr32(int x) { + int b = -1; + asm("bsrl %1,%0" : "+r" (b): "rm" (x) ); + return b + 1; +} + #else +static inline int bsr32(int x) { + return x?32 - __builtin_clz(x):0; +} + #endif + +#define WPUT(__x,__bit) { __bw |= (unsigned long long)(__x)<<__br; __br += __bit; } +#define WPUTZERO(__sel) { __bw = __br = 0; WPUT(__sel,4); } +#define WPUTFLUSH(__out) { *(typeof(__bw) *)__out = __bw; __out += sizeof(__bw)/sizeof(__out[0]); } + +#if 0 //WORD_SIZE==32 + #define CODE_TABLE \ + unsigned char sel2bit[]= { 0, 0, 0, 0, 0, 0, 0, 1 ,2,3,4,5,7,9,14,28}; \ + unsigned sel2elems[]= {256,120,90,60,50,40,32,28,14,9,7,5,4,3, 2, 1}; \ + + #define BIT_2_SEL \ + char bit2sel[]= { 0,7,8,9,10,11,12,12,13,13,14,14,14,14,14, \ + 15,15,15,15,15,15,15,15,15,15,15,15,15,15, \ + -1,-1,-1,-1}; + #define MAX_BIT 28 +#else +#define CODE_TABLE \ + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ \ +unsigned char sel2bit[]= { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60,61}; \ +unsigned sel2elems[]= {256,120,60,30,20,15,12,10, 8, 7, 6, 5, 4, 3, 2, 1}; \ +unsigned sellim[]= {256,120,60,60,60,60,60,60,56,56, 60, 60, 60, 60, 60, 60}; + +#define BIT_2_SEL char bit2sel[]= \ + {0,2,3,4,5,6,7,8, 9,10,10,11,11,12,12,12, \ + 13,13,13,13,13,14,14,14, 14,14,14,14,14,14,14,15, \ + 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, \ + 15,15,15,15,15,15,15,15, 15,15,15,15,15,-1, -1, -1, -1}; + + #define MAX_BIT 60 +#endif + +CODE_TABLE +BIT_2_SEL + 
+unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out) { + unsigned long long __bw; unsigned __br = 0; + unsigned char bits[0x1000]; + int elems; + int i,j; + for (i = 0; i < n; i++) + bits[i] = bsr32(in[i])+1; //CalcMinBits(in, bits, n); + int sel, bit,tmp; /*BLK_ENC_ADJUST*/ + for (i=0; i bit) { + tmp = bit2sel[bits[j]] ; + if(elems < sel2elems[ tmp ]) { + sel = tmp; + bit= sel2bit[sel]; + } else { + while ( elems < sel2elems[sel] ) sel++; + elems = sel2elems[sel]; + bit = sel2bit[sel]; + break; + } + } + elems++; + } + if (bit == 0) { /* can be downgrade to bit=1 */ + if (i+elems elems; sel++); + elems = sel2elems[sel]; + bit = sel2bit[sel]; + } else sel = 0; /* what a waste! */ + } else { + sel = bit2sel[bit]; + bit = sel2bit[sel]; + } + WPUTZERO(sel); + if (bit) { + for ( ; elems ; elems--, i++) WPUT(in[i],bit); + } else + i += elems; + WPUTFLUSH(out); + } + return out; +} + +#define MSK(__x) ((1ul<<__x)-1) +unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out) { + unsigned char *ip = in; + unsigned i,*_out = out,*out_ = out+n; + while(out < out_) { + unsigned long long w = *(unsigned long long *)ip; + switch(w & 15) { + #if 1 + case 0: ip+=8; for(i=0; i<256; i++) out[i]= 1; out += 256; break; + #else + case 0: { int r = (w>>4)&0xf; ip++; if(r == 0xf) { r = (w>>8)&0xff; ip++; } while(r-->=0) *out++=0; } break; + #endif + + case 1: ip+=8; + for(i=0; i<120; i++) out[i]= 1; out += 120; + break; + case 2: ip+=8; + out[ 0]= (w >> 4) & MSK(1); + out[ 1]= (w >> 5) & MSK(1); + out[ 2]= (w >> 6) & MSK(1); + out[ 3]= (w >> 7) & MSK(1); + out[ 4]= (w >> 8) & MSK(1); + out[ 5]= (w >> 9) & MSK(1); + out[ 6]= (w >> 10) & MSK(1); + out[ 7]= (w >> 11) & MSK(1); + out[ 8]= (w >> 12) & MSK(1); + out[ 9]= (w >> 13) & MSK(1); + out[10]= (w >> 14) & MSK(1); + out[11]= (w >> 15) & MSK(1); + out[12]= (w >> 16) & MSK(1); + out[13]= (w >> 17) & MSK(1); + out[14]= (w >> 18) & MSK(1); + out[15]= (w >> 19) & MSK(1); + 
out[16]= (w >> 20) & MSK(1); + out[17]= (w >> 21) & MSK(1); + out[18]= (w >> 22) & MSK(1); + out[19]= (w >> 23) & MSK(1); + out[20]= (w >> 24) & MSK(1); + out[21]= (w >> 25) & MSK(1); + out[22]= (w >> 26) & MSK(1); + out[23]= (w >> 27) & MSK(1); + out[24]= (w >> 28) & MSK(1); + out[25]= (w >> 29) & MSK(1); + out[26]= (w >> 30) & MSK(1); + out[27]= (w >> 31) & MSK(1); + out[28]= (w >> 32) & MSK(1); + out[29]= (w >> 33) & MSK(1); + out[30]= (w >> 34) & MSK(1); + out[31]= (w >> 35) & MSK(1); + out[32]= (w >> 36) & MSK(1); + out[33]= (w >> 37) & MSK(1); + out[34]= (w >> 38) & MSK(1); + out[35]= (w >> 39) & MSK(1); + out[36]= (w >> 40) & MSK(1); + out[37]= (w >> 41) & MSK(1); + out[38]= (w >> 42) & MSK(1); + out[39]= (w >> 43) & MSK(1); + out[40]= (w >> 44) & MSK(1); + out[41]= (w >> 45) & MSK(1); + out[42]= (w >> 46) & MSK(1); + out[43]= (w >> 47) & MSK(1); + out[44]= (w >> 48) & MSK(1); + out[45]= (w >> 49) & MSK(1); + out[46]= (w >> 50) & MSK(1); + out[47]= (w >> 51) & MSK(1); + out[48]= (w >> 52) & MSK(1); + out[49]= (w >> 53) & MSK(1); + out[50]= (w >> 54) & MSK(1); + out[51]= (w >> 55) & MSK(1); + out[52]= (w >> 56) & MSK(1); + out[53]= (w >> 57) & MSK(1); + out[54]= (w >> 58) & MSK(1); + out[55]= (w >> 59) & MSK(1); + out[56]= (w >> 60) & MSK(1); + out[57]= (w >> 61) & MSK(1); + out[58]= (w >> 62) & MSK(1); + out[59]= (w >> 63) & MSK(1); out += 60; + break; + case 3: ip+=8; + out[ 0]= (w >> 4) & MSK(2); + out[ 1]= (w >> 6) & MSK(2); + out[ 2]= (w >> 8) & MSK(2); + out[ 3]= (w >> 10) & MSK(2); + out[ 4]= (w >> 12) & MSK(2); + out[ 5]= (w >> 14) & MSK(2); + out[ 6]= (w >> 16) & MSK(2); + out[ 7]= (w >> 18) & MSK(2); + out[ 8]= (w >> 20) & MSK(2); + out[ 9]= (w >> 22) & MSK(2); + out[10]= (w >> 24) & MSK(2); + out[11]= (w >> 26) & MSK(2); + out[12]= (w >> 28) & MSK(2); + out[13]= (w >> 30) & MSK(2); + out[14]= (w >> 32) & MSK(2); + out[15]= (w >> 34) & MSK(2); + out[16]= (w >> 36) & MSK(2); + out[17]= (w >> 38) & MSK(2); + out[18]= (w >> 40) & MSK(2); + out[19]= (w 
>> 42) & MSK(2); + out[20]= (w >> 44) & MSK(2); + out[21]= (w >> 46) & MSK(2); + out[22]= (w >> 48) & MSK(2); + out[23]= (w >> 50) & MSK(2); + out[24]= (w >> 52) & MSK(2); + out[25]= (w >> 54) & MSK(2); + out[26]= (w >> 56) & MSK(2); + out[27]= (w >> 58) & MSK(2); + out[28]= (w >> 60) & MSK(2); + out[29]= (w >> 62) & MSK(2); out += 30; + break; + case 4: ip+=8; + out[ 0]= (w >> 4) & MSK(3); + out[ 1]= (w >> 7) & MSK(3); + out[ 2]= (w >> 10) & MSK(3); + out[ 3]= (w >> 13) & MSK(3); + out[ 4]= (w >> 16) & MSK(3); + out[ 5]= (w >> 19) & MSK(3); + out[ 6]= (w >> 22) & MSK(3); + out[ 7]= (w >> 25) & MSK(3); + out[ 8]= (w >> 28) & MSK(3); + out[ 9]= (w >> 31) & MSK(3); + out[10]= (w >> 34) & MSK(3); + out[11]= (w >> 37) & MSK(3); + out[12]= (w >> 40) & MSK(3); + out[13]= (w >> 43) & MSK(3); + out[14]= (w >> 46) & MSK(3); + out[15]= (w >> 49) & MSK(3); + out[16]= (w >> 52) & MSK(3); + out[17]= (w >> 55) & MSK(3); + out[18]= (w >> 58) & MSK(3); + out[19]= (w >> 61) & MSK(3); out += 20; + break; + case 5: ip+=8; + out[ 0]= (w >> 4) & MSK(4); + out[ 1]= (w >> 8) & MSK(4); + out[ 2]= (w >> 12) & MSK(4); + out[ 3]= (w >> 16) & MSK(4); + out[ 4]= (w >> 20) & MSK(4); + out[ 5]= (w >> 24) & MSK(4); + out[ 6]= (w >> 28) & MSK(4); + out[ 7]= (w >> 32) & MSK(4); + out[ 8]= (w >> 36) & MSK(4); + out[ 9]= (w >> 40) & MSK(4); + out[10]= (w >> 44) & MSK(4); + out[11]= (w >> 48) & MSK(4); + out[12]= (w >> 52) & MSK(4); + out[13]= (w >> 56) & MSK(4); + out[14]= (w >> 60) & MSK(4); out += 15; + break; + case 6: ip+=8; + out[ 0]= (w >> 4) & MSK(5); + out[ 1]= (w >> 9) & MSK(5); + out[ 2]= (w >> 14) & MSK(5); + out[ 3]= (w >> 19) & MSK(5); + out[ 4]= (w >> 24) & MSK(5); + out[ 5]= (w >> 29) & MSK(5); + out[ 6]= (w >> 34) & MSK(5); + out[ 7]= (w >> 39) & MSK(5); + out[ 8]= (w >> 44) & MSK(5); + out[ 9]= (w >> 49) & MSK(5); + out[10]= (w >> 54) & MSK(5); + out[11]= (w >> 59) & MSK(5); out += 12; + break; + case 7: ip+=8; + out[0]= (w >> 4) & MSK(6); + out[1]= (w >> 10) & MSK(6); + out[2]= (w 
>> 16) & MSK(6); + out[3]= (w >> 22) & MSK(6); + out[4]= (w >> 28) & MSK(6); + out[5]= (w >> 34) & MSK(6); + out[6]= (w >> 40) & MSK(6); + out[7]= (w >> 46) & MSK(6); + out[8]= (w >> 52) & MSK(6); + out[9]= (w >> 58) & MSK(6); out += 10; + break; + case 8: ip+=8; + out[0]= (w >> 4 ) & MSK(7); + out[1]= (w >> 11) & MSK(7); + out[2]= (w >> 18) & MSK(7); + out[3]= (w >> 25) & MSK(7); + out[4]= (w >> 32) & MSK(7); + out[5]= (w >> 39) & MSK(7); + out[6]= (w >> 46) & MSK(7); + out[7]= (w >> 53) & MSK(7); out += 8; + break; + case 9: ip+=8; + out[0]= (w >> 4 ) & MSK(8); + out[1]= (w >> 12) & MSK(8); + out[2]= (w >> 20) & MSK(8); + out[3]= (w >> 28) & MSK(8); + out[4]= (w >> 36) & MSK(8); + out[5]= (w >> 44) & MSK(8); + out[6]= (w >> 52) & MSK(8); out += 7; + break; + case 10: ip+=8; + out[0]= (w >> 4) & MSK(10); + out[1]= (w >> 14) & MSK(10); + out[2]= (w >> 24) & MSK(10); + out[3]= (w >> 34) & MSK(10); + out[4]= (w >> 44) & MSK(10); + out[5]= (w >> 54) & MSK(10); out += 6; + break; + case 11: ip+=8; + out[0]= (w >> 4) & MSK(12); + out[1]= (w >> 16) & MSK(12); + out[2]= (w >> 28) & MSK(12); + out[3]= (w >> 40) & MSK(12); + out[4]= (w >> 52) & MSK(12); out += 5; + break; + case 12: ip+=8; + out[0]= (w >> 4) & MSK(15); + out[1]= (w >> 19) & MSK(15); + out[2]= (w >> 34) & MSK(15); + out[3]= (w >> 49) & MSK(15); out += 4; + break; + case 13: ip+=8; + out[0]= (w >> 4) & MSK(20); + out[1]= (w >> 24) & MSK(20); + out[2]= (w >> 44) & MSK(20); out += 3; + break; + case 14: ip+=8; + out[0]= (w >> 4) & MSK(30); + out[1]= (w >> 34) & MSK(30); out += 2; + break; + case 15: ip+=8; + out[0]= (w >> 4) & ((1ull<<60)-1); out += 1; + break; + } + } + return ip; +} diff --git a/ext/simple8b.h b/ext/simple8b.h new file mode 100644 index 0000000..1d387d5 --- /dev/null +++ b/ext/simple8b.h @@ -0,0 +1,2 @@ +unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out); +unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out); diff --git 
a/ext/vabyte.h b/ext/vabyte.h new file mode 100644 index 0000000..eb73810 --- /dev/null +++ b/ext/vabyte.h @@ -0,0 +1,99 @@ +// "variablebyte.h" C Version port by powturbo from https://github.com/lemire/FastPFor +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#define extract7bits(i, val) (val >> (7 * i)) & ((1U << 7) - 1) +#define extract7bitsmaskless(i, val) (val >> (7 * i)) + +unsigned char *vbyteenc(unsigned *in, const size_t length, unsigned *out/*, + size_t &nvalue*/) { + unsigned char *bout = (unsigned char *)(out); + //const unsigned char *const initbout = (unsigned char *)(out); + //unsigned prev = 0; + size_t k; + for (k = 0; k < length; ++k) { + const unsigned val = /*delta ? in[k] - prev :*/ in[k]; + //if (delta) prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. + */ + if (val < (1U << 7)) { + *bout = (unsigned char)(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bitsmaskless(1,val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bits(1,val); + ++bout; + *bout = extract7bitsmaskless(2,val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits(0, val); + ++bout; + *bout = extract7bits(1, val); + ++bout; + *bout = extract7bits(2, val); + ++bout; + *bout = extract7bitsmaskless(3, val) | (1U << 7); + ++bout; + } else { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bits(1,val); + ++bout; + *bout = extract7bits(2,val); + ++bout; + *bout = extract7bits(3,val); + ++bout; + *bout = extract7bitsmaskless(4,val) | (1U << 7); + ++bout; + } + } + /*while (needPaddingTo32Bits(bout)) { + *bout++ = 0; + } + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes 
/ 4;*/ + return bout; +} + + +unsigned char *vbytedec(const unsigned char *in, const size_t length, + unsigned *out/*, size_t &nvalue*/) { + unsigned prev = 0; + if (length == 0) { + //nvalue = 0; + return (unsigned char *)in;//abort + } + const unsigned char *inbyte = (const unsigned char *)(in); + const unsigned char *const endbyte = (const unsigned char *)(out + + length); + //const unsigned *const initout(out); + + while ((unsigned *)endbyte > out) { + unsigned int shift = 0; unsigned v; + for (v = 0; (unsigned *)endbyte > out; shift += 7) { + unsigned char c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + *out++ = /*delta ? (prev = v + prev) :*/ v; + break; + } + } + } + //nvalue = out - initout; + //inbyte = padTo32bits(inbyte); + return (unsigned char *)inbyte; + } + diff --git a/ext/varintg8iu.c b/ext/varintg8iu.c new file mode 100644 index 0000000..dfca2cc --- /dev/null +++ b/ext/varintg8iu.c @@ -0,0 +1,182 @@ +// C port Version of "VarIntG8IU.h" from https://github.com/lemire/FastPFor +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + */ +/** + * + * Implementation of varint-G8IU taken from + * Stepanov et al., SIMD-Based Decoding of Posting Lists, CIKM 2011 + * + * Update: D. Lemire believes that this scheme was patented by Rose, Stepanov et al. (patent 20120221539). + * We wrote this code before the patent was published (August 2012). + * + * By Maxime Caron + * From + * https://github.com/maximecaron/SIMD-Based-Posting-lists + * with minor modifications by D. Lemire. 
+ */ +#include +#ifndef __SSSE3__ +#pragma message "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3" +#else +#ifndef VARINTG8IU_H__ +#define VARINTG8IU_H__ +#include +//#include "codecs.h" +#ifdef __GNUC__ +#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#else +#define PREDICT_FALSE(x) x +#define PREDICT_TRUE(x) x +#endif +#include "varintg8iu.h" + +typedef char v16qi __attribute__ ((vector_size (16))); + +static int maskOutputSize[256]; +static char mask[256][32]; + + int getNumByteNeeded(const uint32_t value) { + if (value > 0x000000FF) { + if (value > 0x0000FFFF) { + if (value > 0x00FFFFFF) { + return 4; + } else { + return 3; + } + } else { + return 2; + } + } else { + return 1; + } + } + + + // For all possible values of the + // descriptor we build a table of any shuffle sequence + // that might be needed at decode time. +void VarIntG8IU() { + for (int desc = 0; desc <= 255; desc++) { + int bitmask = 0x00000001; + int bitindex = 0; + // count number of 0 in the char + int complete = 0; + int ithSize[8]; + int lastpos = -1; + while (bitindex < 8) { + if ((desc & bitmask) == 0) { + ithSize[complete] = bitindex - lastpos; + lastpos = bitindex; + complete++; + } + bitindex++; + bitmask = bitmask << 1; + } + maskOutputSize[desc] = complete; + + int j = 0; + int k = 0; + for (int i = 0; i < complete; i++) { + for (int n = 0; n < 4; n++) { + if (n < ithSize[i]) { + mask[desc][k] = j; + j = j + 1; + } else { + mask[desc][k] = -1; + } + k = k + 1; + } + } + + } + + } + +unsigned char *vintg8enc(const uint32_t *__restrict in, const size_t length, unsigned char *__restrict out) { + const uint32_t *in_ = in + length; //size_t srclength = length * 4;unsigned char* dest = (unsigned char*)(out);size_t dstlength = length * 4; + //size_t compressed_size = 0; + while(in < in_ /*srclength > 0 && dstlength >= 9*/) { //compressed_size += encodeBlock(in, srclength, dst, nvalue); + unsigned char desc = 
0xFF; + unsigned char bitmask = 0x01; + uint32_t buffer[8]; + int ithSize[8]; + int length = 0; + int numInt = 0; + + while (in < in_ /*srclength > 0*/) { + const uint32_t* temp = in; + int byteNeeded = getNumByteNeeded(*temp); + + if (PREDICT_FALSE(length + byteNeeded > 8)) { + break; + } + + //flip the correct bit in desc + bitmask = bitmask << (byteNeeded - 1); + desc = desc ^ bitmask; + bitmask = bitmask << 1; + + ithSize[numInt] = byteNeeded; + length += byteNeeded; + buffer[numInt] = *temp; + ++in;// = in + 1; + //srclength -= 4; + numInt++; + } + out[0] = desc; + int written = 1; + for(int i = 0; i < numInt; i++) { + int size = ithSize[i]; + uint32_t value = buffer[i]; + for (int j = 0; j < size; j++) { + out[written++] = value >> (j * 8); + } + } + out += 9; //dstlength -= 9; //compressed_size += 9; + } + //Ouput might not be a multiple of 4 so we make it so + return out; //out + ((compressed_size + 3)/ 4); + } + +unsigned char *vintg8dec(unsigned char *__restrict in, const size_t length, uint32_t *__restrict out) { + size_t srclength = length * 4; + const unsigned *out_ = out + length; //uint32_t * dest = out;size_t nvalue = length * 4; //uint32_t uncompressSize = 0; + while (out < out_ /*srclength >= 9*/) { //uncompressSize += decodeBlock(in, srclength, dst/*, nvalue*/); + const unsigned char* pdesc = in++; + unsigned char desc = *pdesc; + srclength -= 1; + + const unsigned char* peek = in; + v16qi data; + if (PREDICT_TRUE(srclength >= 16)) { + // read 16 byte of data only if we need to avoid cache miss + data = __builtin_ia32_lddqu((const char*) (peek)); + } else { + static char buff[16]; + memcpy(buff, peek, 8); + data = __builtin_ia32_lddqu(buff); + } + // load de required mask + v16qi shf = __builtin_ia32_lddqu(mask[desc]); + v16qi result = __builtin_ia32_pshufb128(data, shf); + char* dst = (char*) (out); + __builtin_ia32_storedqu(dst, result); + int readSize = maskOutputSize[desc]; + + if (PREDICT_TRUE( readSize >= 4)) { + v16qi shf2 = 
__builtin_ia32_lddqu(mask[desc] + 16); + v16qi result2 = __builtin_ia32_pshufb128(data, shf2); + __builtin_ia32_storedqu(dst + (16), result2); + } + // pop 8 input char + in += 8; srclength -= 8; out += readSize; //dstlength -= readSize * 4;// uncompressSize += readSize; + } + return in; //(uint32_t *) (((uintptr_t) (src) + 3) & ~3); + +} + +#endif //__SSE3__ +#endif diff --git a/ext/varintg8iu.h b/ext/varintg8iu.h new file mode 100644 index 0000000..48c8eac --- /dev/null +++ b/ext/varintg8iu.h @@ -0,0 +1,5 @@ +#include +void VarIntG8IU(); +unsigned char *vintg8enc(const uint32_t *__restrict in, const size_t length, unsigned char *__restrict out); +unsigned char *vintg8dec(unsigned char *__restrict in, const size_t length, uint32_t *__restrict out); + diff --git a/ext/vas16c.h b/ext/vas16c.h new file mode 100644 index 0000000..84fffd4 --- /dev/null +++ b/ext/vas16c.h @@ -0,0 +1,36 @@ +// optimized version from: http://jinruhe.com/ +static int s16_cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1}; +static int s16_cbits[16][28] = { + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0}, + {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + 
{28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} }; + +#define S16ENC(__w, __p, m) { unsigned *_p = __p, *_w = __w; \ + unsigned int _k, _j, _m, _o; \ + for (_k = 0; _k < 16; _k++) { \ + *_w = _k<<28; \ + _m = (s16_cnum[_k] < m)? s16_cnum[_k]:m; \ + for (_j = 0, _o = 0; (_j < _m) && (*(_p+_j) < (1<>28) {\ + case 0:\ + _p[ 0] = (_rw ) & 1;\ + _p[ 1] = (_rw>> 1) & 1;\ + _p[ 2] = (_rw>> 2) & 1;\ + _p[ 3] = (_rw>> 3) & 1;\ + _p[ 4] = (_rw>> 4) & 1;\ + _p[ 5] = (_rw>> 5) & 1;\ + _p[ 6] = (_rw>> 6) & 1;\ + _p[ 7] = (_rw>> 7) & 1;\ + _p[ 8] = (_rw>> 8) & 1;\ + _p[ 9] = (_rw>> 9) & 1;\ + _p[10] = (_rw>>10) & 1;\ + _p[11] = (_rw>>11) & 1;\ + _p[12] = (_rw>>12) & 1;\ + _p[13] = (_rw>>13) & 1;\ + _p[14] = (_rw>>14) & 1;\ + _p[15] = (_rw>>15) & 1;\ + _p[16] = (_rw>>16) & 1;\ + _p[17] = (_rw>>17) & 1;\ + _p[18] = (_rw>>18) & 1;\ + _p[19] = (_rw>>19) & 1;\ + _p[20] = (_rw>>20) & 1;\ + _p[21] = (_rw>>21) & 1;\ + _p[22] = (_rw>>22) & 1;\ + _p[23] = (_rw>>23) & 1;\ + _p[24] = (_rw>>24) & 1;\ + _p[25] = (_rw>>25) & 1;\ + _p[26] = (_rw>>26) & 1;\ + _p[27] = (_rw>>27) & 1; _p += 28;\ + break;\ + case 1: \ + _p[ 0] = (_rw ) & 3;\ + _p[ 1] = (_rw>> 2) & 3;\ + _p[ 2] = (_rw>> 4) & 3;\ + _p[ 3] = (_rw>> 6) & 3;\ + _p[ 4] = (_rw>> 8) & 3;\ + _p[ 5] = (_rw>>10) & 3;\ + _p[ 6] = (_rw>>12) & 3;\ + _p[ 7] = (_rw>>14) & 1;\ + _p[ 8] = (_rw>>15) & 1;\ + _p[ 9] = (_rw>>16) & 1;\ + _p[10] = (_rw>>17) & 1;\ + _p[11] = (_rw>>18) & 1;\ + _p[12] = (_rw>>19) & 1;\ + _p[13] = (_rw>>20) & 1;\ + _p[14] = (_rw>>21) & 1;\ + _p[15] = (_rw>>22) & 1;\ + _p[16] = (_rw>>23) & 1;\ + _p[17] = (_rw>>24) & 1;\ + _p[18] = (_rw>>25) & 1;\ + _p[19] = (_rw>>26) & 1;\ + _p[20] = (_rw>>27) & 1; _p += 21; \ + break; \ + case 2: \ + _p[0] = (_rw) & 1; \ + _p[1] = (_rw>>1) & 1;\ + _p[2] = (_rw>>2) & 1;\ + _p[3] = (_rw>>3) & 1;\ + _p[4] = (_rw>>4) & 1;\ + _p[5] = (_rw>>5) & 1;\ + _p[6] = (_rw>>6) & 1;\ + _p[7] = (_rw>>7) & 3;\ + _p[8] = (_rw>>9) & 3;\ + _p[9] = (_rw>>11) & 3;\ + _p[10] = (_rw>>13) & 3;\ + _p[11] = 
(_rw>>15) & 3;\ + _p[12] = (_rw>>17) & 3;\ + _p[13] = (_rw>>19) & 3;\ + _p[14] = (_rw>>21) & 1;\ + _p[15] = (_rw>>22) & 1;\ + _p[16] = (_rw>>23) & 1;\ + _p[17] = (_rw>>24) & 1;\ + _p[18] = (_rw>>25) & 1;\ + _p[19] = (_rw>>26) & 1;\ + _p[20] = (_rw>>27) & 1; _p += 21;\ + break; \ + case 3: \ + _p[0] = (_rw) & 1; \ + _p[1] = (_rw>>1) & 1;\ + _p[2] = (_rw>>2) & 1;\ + _p[3] = (_rw>>3) & 1;\ + _p[4] = (_rw>>4) & 1;\ + _p[5] = (_rw>>5) & 1;\ + _p[6] = (_rw>>6) & 1;\ + _p[7] = (_rw>>7) & 1;\ + _p[8] = (_rw>>8) & 1;\ + _p[9] = (_rw>>9) & 1;\ + _p[10] = (_rw>>10) & 1;\ + _p[11] = (_rw>>11) & 1;\ + _p[12] = (_rw>>12) & 1;\ + _p[13] = (_rw>>13) & 1;\ + _p[14] = (_rw>>14) & 3;\ + _p[15] = (_rw>>16) & 3;\ + _p[16] = (_rw>>18) & 3;\ + _p[17] = (_rw>>20) & 3;\ + _p[18] = (_rw>>22) & 3;\ + _p[19] = (_rw>>24) & 3;\ + _p[20] = (_rw>>26) & 3; _p += 21;\ + break; \ + case 4: \ + _p[ 0] = (_rw ) & 3;\ + _p[ 1] = (_rw>> 2) & 3;\ + _p[ 2] = (_rw>> 4) & 3;\ + _p[ 3] = (_rw>> 6) & 3;\ + _p[ 4] = (_rw>> 8) & 3;\ + _p[ 5] = (_rw>>10) & 3;\ + _p[ 6] = (_rw>>12) & 3;\ + _p[ 7] = (_rw>>14) & 3;\ + _p[ 8] = (_rw>>16) & 3;\ + _p[ 9] = (_rw>>18) & 3;\ + _p[10] = (_rw>>20) & 3;\ + _p[11] = (_rw>>22) & 3;\ + _p[12] = (_rw>>24) & 3;\ + _p[13] = (_rw>>26) & 3; _p += 14;\ + break; \ + case 5: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 7;\ + _p[2] = (_rw>>7) & 7;\ + _p[3] = (_rw>>10) & 7;\ + _p[4] = (_rw>>13) & 7;\ + _p[5] = (_rw>>16) & 7;\ + _p[6] = (_rw>>19) & 7;\ + _p[7] = (_rw>>22) & 7;\ + _p[8] = (_rw>>25) & 7; _p += 9;\ + break; \ + case 6: \ + _p[0] = (_rw) & 7; \ + _p[1] = (_rw>>3) & 15;\ + _p[2] = (_rw>>7) & 15;\ + _p[3] = (_rw>>11) & 15;\ + _p[4] = (_rw>>15) & 15;\ + _p[5] = (_rw>>19) & 7;\ + _p[6] = (_rw>>22) & 7;\ + _p[7] = (_rw>>25) & 7; _p += 8;\ + break; \ + case 7: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 15;\ + _p[3] = (_rw>>12) & 15;\ + _p[4] = (_rw>>16) & 15;\ + _p[5] = (_rw>>20) & 15;\ + _p[6] = (_rw>>24) & 15; _p += 7;\ + break; \ + case 8: \ + 
_p[0] = (_rw ) & 31;\ + _p[1] = (_rw>> 5) & 31;\ + _p[2] = (_rw>>10) & 31;\ + _p[3] = (_rw>>15) & 31;\ + _p[4] = (_rw>>20) & 15;\ + _p[5] = (_rw>>24) & 15; _p += 6;\ + break; \ + case 9: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 31;\ + _p[3] = (_rw>>13) & 31;\ + _p[4] = (_rw>>18) & 31;\ + _p[5] = (_rw>>23) & 31; _p += 6;\ + break; \ + case 10: \ + _p[0] = (_rw) & 63; \ + _p[1] = (_rw>>6) & 63;\ + _p[2] = (_rw>>12) & 63;\ + _p[3] = (_rw>>18) & 31;\ + _p[4] = (_rw>>23) & 31; _p += 5;\ + break; \ + case 11: \ + _p[0] = (_rw) & 31; \ + _p[1] = (_rw>>5) & 31;\ + _p[2] = (_rw>>10) & 63;\ + _p[3] = (_rw>>16) & 63;\ + _p[4] = (_rw>>22) & 63; _p += 5;\ + break; \ + case 12: \ + _p[0] = (_rw) & 127; \ + _p[1] = (_rw>>7) & 127;\ + _p[2] = (_rw>>14) & 127;\ + _p[3] = (_rw>>21) & 127; _p += 4;\ + break; \ + case 13: \ + _p[0] = (_rw) & 1023; \ + _p[1] = (_rw>>10) & 511;\ + _p[2] = (_rw>>19) & 511; _p += 3;\ + break; \ + case 14: \ + _p[0] = (_rw) & 16383; \ + _p[1] = (_rw>>14) & 16383; _p += 2;\ + break; \ + case 15: \ + _p[0] = (_rw) & ((1<<28)-1); _p++; \ + break; \ + } \ +} +static inline unsigned char *vs16dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *out_ = out+n; while(out < out_) S16DEC(in, out, ;); return (unsigned char *)in; } + +#if 0 +#define BREAK _rw = *_in++; goto *_lab[__out<_oute?((_rw)>>28):16] + +#define s16dec(__in, __n, __pout) ({\ + __label__ _lab0,_lab1,_lab2,_lab3,_lab4,_lab5,_lab6,_lab7,_lab8,_lab9,_lab10,_lab11,_lab12,_lab13,_lab14,_lab15,_labend;\ + static void *_lab[] = { &&_lab0, &&_lab1, &&_lab2, &&_lab3, &&_lab4, &&_lab5, &&_lab6, &&_lab7, &&_lab8, &&_lab9, &&_lab10, &&_lab11, &&_lab12, &&_lab13, &&_lab14, &&_lab15, &&_labend };\ + unsigned *_in = __in; typeof(__pout[0]) *__out = __pout, *_oute = __out+(__n); register unsigned _rw = *_in++; goto *_lab[(_rw)>>28];\ + _lab0:\ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + 
__out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 1; \ + __out[15] = (_rw>>15) & 1; \ + __out[16] = (_rw>>16) & 1; \ + __out[17] = (_rw>>17) & 1; \ + __out[18] = (_rw>>18) & 1; \ + __out[19] = (_rw>>19) & 1; \ + __out[20] = (_rw>>20) & 1; \ + __out[21] = (_rw>>21) & 1; \ + __out[22] = (_rw>>22) & 1; \ + __out[23] = (_rw>>23) & 1; \ + __out[24] = (_rw>>24) & 1; \ + __out[25] = (_rw>>25) & 1; \ + __out[26] = (_rw>>26) & 1; \ + __out[27] = (_rw>>27) & 1; __out += 28;\ + BREAK; \ + _lab1: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 1; \ + __out[8] = (_rw>>15) & 1; \ + __out[9] = (_rw>>16) & 1; \ + __out[10] = (_rw>>17) & 1; \ + __out[11] = (_rw>>18) & 1; \ + __out[12] = (_rw>>19) & 1; \ + __out[13] = (_rw>>20) & 1; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ + __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21; \ + BREAK; \ + _lab2: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 3; \ + __out[8] = (_rw>>9) & 3; \ + __out[9] = (_rw>>11) & 3; \ + __out[10] = (_rw>>13) & 3; \ + __out[11] = (_rw>>15) & 3; \ + __out[12] = (_rw>>17) & 3; \ + __out[13] = (_rw>>19) & 3; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ 
+ __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21;\ + BREAK; \ + _lab3: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 3; \ + __out[15] = (_rw>>16) & 3; \ + __out[16] = (_rw>>18) & 3; \ + __out[17] = (_rw>>20) & 3; \ + __out[18] = (_rw>>22) & 3; \ + __out[19] = (_rw>>24) & 3; \ + __out[20] = (_rw>>26) & 3; __out += 21;\ + BREAK; \ + _lab4: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 3; \ + __out[8] = (_rw>>16) & 3; \ + __out[9] = (_rw>>18) & 3; \ + __out[10] = (_rw>>20) & 3; \ + __out[11] = (_rw>>22) & 3; \ + __out[12] = (_rw>>24) & 3; \ + __out[13] = (_rw>>26) & 3; __out += 14;\ + BREAK; \ + _lab5: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 7; \ + __out[2] = (_rw>>7) & 7; \ + __out[3] = (_rw>>10) & 7; \ + __out[4] = (_rw>>13) & 7; \ + __out[5] = (_rw>>16) & 7; \ + __out[6] = (_rw>>19) & 7; \ + __out[7] = (_rw>>22) & 7; \ + __out[8] = (_rw>>25) & 7; __out += 9;\ + BREAK; \ + _lab6: \ + __out[0] = (_rw) & 7; \ + __out[1] = (_rw>>3) & 15; \ + __out[2] = (_rw>>7) & 15; \ + __out[3] = (_rw>>11) & 15; \ + __out[4] = (_rw>>15) & 15; \ + __out[5] = (_rw>>19) & 7; \ + __out[6] = (_rw>>22) & 7; \ + __out[7] = (_rw>>25) & 7; __out += 8;\ + BREAK; \ + _lab7: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 15; \ + __out[3] = (_rw>>12) & 15; \ + __out[4] = (_rw>>16) & 15; \ + __out[5] = (_rw>>20) & 15; \ + __out[6] = (_rw>>24) & 15; __out += 7;\ 
+ BREAK; \ + _lab8: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 31; \ + __out[3] = (_rw>>15) & 31; \ + __out[4] = (_rw>>20) & 15; \ + __out[5] = (_rw>>24) & 15; __out += 6;\ + BREAK; \ + _lab9: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 31; \ + __out[3] = (_rw>>13) & 31; \ + __out[4] = (_rw>>18) & 31; \ + __out[5] = (_rw>>23) & 31; __out += 6;\ + BREAK; \ + _lab10: \ + __out[0] = (_rw) & 63; \ + __out[1] = (_rw>>6) & 63; \ + __out[2] = (_rw>>12) & 63; \ + __out[3] = (_rw>>18) & 31; \ + __out[4] = (_rw>>23) & 31; __out += 5;\ + BREAK; \ + _lab11: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 63; \ + __out[3] = (_rw>>16) & 63; \ + __out[4] = (_rw>>22) & 63; __out += 5;\ + BREAK; \ + _lab12: \ + __out[0] = (_rw) & 127; \ + __out[1] = (_rw>>7) & 127; \ + __out[2] = (_rw>>14) & 127; \ + __out[3] = (_rw>>21) & 127; __out += 4;\ + BREAK; \ + _lab13: \ + __out[0] = (_rw) & 1023; \ + __out[1] = (_rw>>10) & 511; \ + __out[2] = (_rw>>19) & 511; __out += 3;\ + BREAK; \ + _lab14:\ + __out[0] = (_rw) & 16383; \ + __out[1] = (_rw>>14) & 16383; __out += 2;\ + BREAK; \ + _lab15:\ + __out[0] = (_rw) & ((1<<28)-1); __out++; \ + BREAK;\ + _labend:;(_in-1);\ +}) +#endif diff --git a/ext/vbyte_poly.h b/ext/vbyte_poly.h new file mode 100644 index 0000000..3c2668d --- /dev/null +++ b/ext/vbyte_poly.h @@ -0,0 +1,46 @@ +// +#define VBYTE_ENC(_v, _n) \ +{\ + unsigned _num; \ + unsigned char _barray[5]; \ + unsigned _i, _started = 0; \ + _num = _n; \ + for (_i = 0; _i < 5; _i++) \ + { \ + _barray[_i] = ((_num%128)<<1); \ + _num = _num/128; \ + } \ + for (_i = 4; _i > 0; _i--) \ + { \ + if ((_barray[_i] != 0) || (_started == 1)) \ + { \ + _started = 1; \ + *_v = _barray[_i]|0x1; \ + _v++; \ + } \ + } \ + *_v = _barray[0]|0x0; \ + _v++; \ +} + +#define VBYTE_DEC(_v, _n) \ +{\ + _n = ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if 
((*_v&0x1)!= 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + }\ + }\ + }\ + _v++; \ +} + diff --git a/icbench.c b/icbench.c index d417e9f..d073c76 100644 --- a/icbench.c +++ b/icbench.c @@ -1,7 +1,7 @@ /** Copyright (C) powturbo 2013-2014 GPL v2 License - + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or @@ -16,321 +16,351 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo icbench.c - "Integer Compression" benchmark program **/ - + +#define _LARGEFILE64_SOURCE 1 +#define _FILE_OFFSET_BITS 64 #include #include #include #include - +#include + #include #include -#define PGM_FD(__f) struct stat sbuf; fstat(__f, &sbuf); __off64_t vlen = sbuf.st_size, vtel = 0; int pgm = 0; time_t t0 = time(NULL); -#define PGM_FDPUT(__f) vtel = lseek(__f, 0, SEEK_CUR);if(vtel*10/vlen != pgm) { double secs = time(NULL) - t0; pgm = vtel*10/vlen; printf("%d%%%.1f ", pgm, ((secs/60.0) * (vlen - vtel))/vtel); fflush(stdout); } -//------------------------------------------------------------------------------------------------------------- +#include + +// simple-8b simple16 optpfd don't work with all interger lists. +// Enable if you to want to test +//#define USE_SIMPLE_8B // crashs on some lists +//#define USE_SIMPLE16 // limited to 28 bits +//#define USE_OPTPFD // compression too slow and limited to 28 bits. 
crashs on some lists +#define STATS +//---------------------------------------- Platform ------------------------ + #ifdef _WIN32 +#define srand48(x) srand(x) +#define drand48() ((double)(rand()) / RAND_MAX) +#define __off64_t _off64_t + #endif +//---------------------------------------- Time --------------------------------------------------------------------- typedef unsigned long long tm_t; #define TM_TMAX (1ull<<63) - #ifdef _MSC_VER // __rdtsc -#include - #else -#include - #endif - - #ifdef _WIN32 -#include -#define TM_T 1 - -static tm_t tmtime(void) { - LARGE_INTEGER tm; - QueryPerformanceCounter(&tm); - return (tm_t)(tm.QuadPart/tps.QuadPart); -} - -LARGE_INTEGER tps; -static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } - #else #include #define TM_T 1000000.0 -static tm_t tmtime(void) { - struct timeval tm; - gettimeofday(&tm, NULL); - return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; -} +static tm_t tmtime(void) { struct timeval tm; gettimeofday(&tm, NULL); return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; } +static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } +static double tmsec( tm_t tm) { return (double)tm/1000000.0; } +static double tmmsec(tm_t tm) { return (double)tm/1000.0; } -static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } - #endif -//-------------------------------------------------------------------------------------------------------- -#include "vint.h" +//-------------------------------------- TurboPFor ------------------------------------------------------------------ +#include "vint.h" #include "vsimple.h" #include "bitpack.h" #include "bitunpack.h" + #include "vp4dc.h" #include "vp4dd.h" -#include "aux/vas16c.h" -#include "aux/vas16d.h" -#include "aux/OPT_PFD/opt_p4.h" -#include "aux/vabyte.h" -#include "aux/simple8b.h" -#include "aux/varintg8iu.h" +unsigned char *u32enc(unsigned *__restrict in, int n, unsigned 
*__restrict out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return (unsigned char *)out;} +unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return (unsigned char *)in; } + +#define PAD8(__x) (((__x)+7)/8) +unsigned char *_bitunpackx32(unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); } + +unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); } +unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); } +unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); } +unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); } +//-------------------------------------- External functions for comparison ------------------------------------------------------------------------ +#include "ext/vas16c.h" +#include "ext/vas16d.h" +#include "ext/OPT_PFD/opt_p4.h" +#include "ext/vabyte.h" +#include "ext/simple8b.h" +#include "ext/varintg8iu.h" +#include "ext/varintg8iu.h" +#include "ext/simdcomp/include/simdbitpacking.h" unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; 
uint32_t *in_; for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b); - return out; + return (unsigned char *)out; } unsigned char *simdpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; uint32_t *in_; for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpack(in, (__m128i *)out, b); - return out; + return (unsigned char *)out; } unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) { uint32_t k, *out_; - for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack(in, out, b); - return in; + for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack((const __m128i *)in, out, b); + return (unsigned char *)in; } unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456; uint32_t *in_; for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); //simdpackwithoutmaskd1(x, ip+1, (__m128i *)out, b); - return out; + return (unsigned char *)out; } unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) { uint32_t k, *out_; for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b); - return in; + return (unsigned char *)in; } -unsigned char *u32enc(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return out;} -unsigned char *u32dec(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return in;} - -#include "aux/vbyte_poly.h" -unsigned char *vavbyte1enc(int *in, int n, unsigned char *out) { +#include 
"ext/vbyte_poly.h" +unsigned char *vbpolyenc(int *in, int n, unsigned char *out) { int i; for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } return out; } -void vavbyte1dec(unsigned char *in, int n, int *out) { - int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } return out; +unsigned char *vbpolydec(unsigned char *in, int n, int *out) { + int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } return in; } - -//------------------------------------------------------------------------------------------------- -#define VBLIM 64 +//------------------------------------------------------------------------------------------------------------------- +#define BLK_SIZE (64*1024) +#define PACK_SIZE 128 enum { P_CPY, - P_VB, P_VBL, P_VG8, + P_VB, P_VBL, P_VG8, P_VBP, P_VBI, P_PCK, P_PCKR, P_SIMDH, - P_SV, P_S16, P_S8BO, - P_P4D, P_P4DR, P_OPTP4 -}; - -unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int bb) { - unsigned *ip=in; - int i,b; + P_SV, P_S16, P_S64, + P_P4D, P_P4DR, P_OPTP4, +}; +//------------------------------------------------ random integer array (not sorted) --------------------------------------------------------------------------- +unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int b) { int i; switch(id) { - case P_CPY: - out = u32enc( ip, n, out); break; - case P_VB: - out = vbenc( ip, n, out); break; - case P_VBL: - out = vbyteenc( ip, n, out); break; - case P_VG8: - out = vintg8enc(ip, n, out); break; + case P_CPY: return u32enc( in, n, (unsigned *)out); - //----------- simple ------------------- - case P_SV: - out = vsenc32( ip, n, out); break; - case P_S16: - { unsigned *c=ip,*ce=c+n; - while(c < ce) S16ENC(out, c, ce - c); - } - break; - case P_S8BO: - out = s8benco( ip, n, out); - break; + // --------- variable byte ------------------------------------------- + case P_VB: return vbenc( in, n, out); - //----------- PFOR ------------------- + 
case P_VBL: return vbyteenc( in, n, (unsigned *)out); + case P_VG8: return vintg8enc(in, n, out); + case P_VBP: return vbpolyenc(in, n, out); + // --------- simple family: simple16, simpleV, simple64 --------------- + case P_SV: return vsenc32( in, n, out); + + case P_S16: return vs16enc( in, n, (unsigned *)out); + case P_S64: return vs8benc( in, n, out); + // --------- PFor ----------------------------------------------------- case P_P4DR: - case P_P4D: - if(n>= 5; - } - *op = x; - in = bitunpack32( in, n-1, b, op+1); - } - break; - case P_PCKR: - { - unsigned x; - vbgeta(in, x, ;); - if(bb < 0) { - b = x & 0x1f; x >>= 5; - } - *op = x; - in = _bitunpackx32(in, n-1, b, op+1); - } - break; - case P_SIMDH: - if(n <129) in = vbytedec(in, n, op); - else { - unsigned x; - vbgeta(in, x, ;); - if(bb < 0) { - b = x & 0x1f; x >>= 5; - } - *op = x; - in = simdunpackn( in, n-1, b, op+1); - } - break; - default: printf("Fatal- Not entry %d", id); exit(0); +unsigned char *bedec(unsigned char *in, size_t n, unsigned *out, int id, int b) { + switch(id) { + case P_CPY: return u32dec( (unsigned *)in, n, out); + // --------- variable byte ------------------------------------------- + case P_VB: return vbdec( in, n, out); + + case P_VBL: return vbytedec( in, n, out); + case P_VG8: return vintg8dec(in, n, out); + case P_VBP: return vbpolydec(in, n, out); + + // --------- simple family: simple16, simpleV, simple64 --------------- + case P_SV: return vsdec32( in, n, out); + + case P_S16: return vs16dec( (unsigned *)in, n, out); + case P_S64: return vs8bdec( in, n, out); + + // --------- PFor ----------------------------------------------------- + case P_OPTP4 : if(n < 128) return vbytedec(in, n, out); else { unsigned all_array[2048]; return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); } + case P_P4D : return p4ddec32( in, n, out); + case P_P4DR : return p4ddecx32( in, n, out); + + // --------- bit packing ------------------------------------------- + case P_PCK: 
if(b < 0) b = *in++; return bitunpack32( in, n, b, out); + case P_PCKR: if(b < 0) b = *in++; return _bitunpackx32( in, n, b, out); + + case P_SIMDH: + if(n < 128) return vbytedec(in, n, out); + else { if(b < 0) b = *in++; return simdunpackn( (unsigned *)in, n, b, out); } + default: die("Fatal- Not entry %d", id); } return in; } -struct libss { int id; char *s,*v; }; +//------------------------------------------------- Sorted integer array : Delta/Differential compression ------------------------------------------------ +//#define DELTA(in, n, mode, pa) for(pa[0]=in[0],v = 1; v < n; v++) pa[v] = in[v] - in[v-1] - mode +#define DELTA( __in, __n, __mode, __pa) { unsigned _v; for( __pa[0]=__in[0],_v = __n-1; _v > 0; --_v) __pa[_v] = (__in[_v] - __in[_v-1]) - __mode; } +#define DELTAB(__in, __n, __mode, __b, __pa) { unsigned _v; for(__b=0,__pa[0]=__in[0],_v = __n-1; _v > 0; --_v) __pa[_v] = (__in[_v] - __in[_v-1]) - __mode, __b |= __pa[_v]; __b = bsr32(__b); } + +#define DELTR( __in, __n, __mode, __pa) { unsigned _v; for( __pa[0]=__in[0],_v = 1; _v < __n; _v++) __pa[_v] = (__in[_v] - __pa[0]) - _v*__mode; } +#define DELTRB(__in, __n, __mode, __b, __pa) { unsigned _v; for(__b=0,__pa[0]=__in[0],_v = 1; _v < __n; _v++) __pa[_v] = (__in[_v] - __pa[0]) - _v*__mode, __b |= __pa[_v]; __b = bsr32(__b); } + +unsigned char *besenc(unsigned *in, size_t n, unsigned char *out, int id, int mode) { + unsigned pa[BLK_SIZE+2048],x; unsigned b; + + switch(id) { + case P_CPY: return u32enc( in, n, (unsigned *)out); + //----------- Variable byte ---------------------------------------------------------------------------------------- + case P_VB: DELTA( in, n, mode, pa); return vbenc( pa, n, out); + + case P_VBL: DELTA( in, n, mode, pa); return vbyteenc( pa, n, (unsigned *)out); + case P_VBP: DELTA( in, n, mode, pa); return vbpolyenc(pa, n, out); + case P_VG8: DELTA( in, n, mode, pa); return vintg8enc(pa, n, out); + // --------- Simple family --------- + case P_SV: DELTA( in, n, mode, pa); 
vbput(out, pa[0]); return vsenc32( pa+1, n-1, out); + + case P_S16: DELTAB(in, n, mode, b, pa); if(b>28) die("simple16 overflow.bits size>28\n"); + vbput(out, pa[0]); return vs16enc( pa+1, n-1, (unsigned *)out); + case P_S64: DELTA( in, n, mode, pa); if(b>28) die("simple-8b overflow.bits size>28\n"); + vbput(out, pa[0]); return vs8benc( pa+1, n-1, out); + // --------- PFor ------------------------------------------------------------------------------------------------- + case P_P4D: DELTA( in, n, mode, pa); vbput(out, pa[0]); return p4denc32( pa+1, n-1, out); + case P_P4DR: DELTR( in, n, mode, pa); vbput(out, pa[0]); return p4denc32( pa+1, n-1, out); + + case P_OPTP4: DELTAB(in, n, mode, b, pa); if(b>28) die("optp4 overflow.bits size>28\n"); + if(n < 129) { return vbenc(pa, n, out); } + else { vbput(out, pa[0]); return out + OPT4(pa+1, n-1, (unsigned *)out); } + // --------- bit packing ----------------------------------------------------------------------------------------------- + case P_PCK: DELTAB(in, n, mode, b, pa); vbput(out, pa[0]); *out++=b; return bitpack32(pa+1, n-1, b, out); + case P_PCKR: DELTRB(in, n, mode, b, pa); vbput(out, pa[0]); *out++=b; return bitpack32(pa+1, n-1, b, out); + + case P_SIMDH: + if(n < 129) { DELTA(in, n, mode, pa); return vbyteenc((unsigned *)pa, n, (unsigned *)out); } + else { b = simdmaxbitsd1(in[0], in+1); vbput(out, in[0]); *out++=b; return simdpackwn1((unsigned *)(in+1), n-1, b, in[0], (unsigned *)out); } + } +} + +#define UNDELTA(__out, __n, __mode) { unsigned _x,_v; for(_x = __out[0],_v=1;_v<__n;_v++) __out[_v] = (_x += __out[_v] + __mode); } + +unsigned char *besdec(unsigned char *in, size_t n, unsigned *out, int id, int mode) { unsigned b,x,v; + switch(id) { + case P_CPY: in = u32dec( (unsigned *)in, n, out); break; + //------------- Variable byte ---------------------------------------------- + case P_VB: in = vbdec( in, n, out); UNDELTA(out, n, mode); break; + + case P_VBL: in = vbytedec( in, n, out); UNDELTA(out, n, 
mode); break; + case P_VBP: in = vbpolydec( in, n, out); UNDELTA(out, n, mode); break; + case P_VG8: in = vintg8dec( in, n, out); UNDELTA(out, n, mode); break; + //------------- Simple family ---------------------------------------------- + case P_SV: vbgeta(in, x, *out = x); in = vsdec32( in, n-1, out+1); UNDELTA(out, n, mode); break; + + case P_S16: vbgeta(in, x, *out = x); in = vs16dec((unsigned *)in, n-1, out+1); UNDELTA(out, n, mode); break; + case P_S64: vbgeta(in, x, *out = x); in = vs8bdec( in, n-1, out+1); UNDELTA(out, n, mode); break; + // ------------ PFor ------------------------------------------------------- + case P_P4D: vbgeta(in, x, *out = x); in = p4ddec32( in, n-1, out+1); UNDELTA(out, n, mode); break; + case P_P4DR: vbgeta(in, x, *out = x); return mode?p4dfdecx32(in, n-1, x, out+1):p4df0decx32( in, n-1, x, out+1); + + case P_OPTP4: + if(n < 129) in = vbdec(in, n, out); + else { vbgeta(in, x, *out = x); unsigned all_array[2048]; in = (unsigned char *)detailed_p4_decode(out+1, (unsigned *)in, all_array); } + UNDELTA(out, n, mode); + break; + // --------- bit packing ---------------------------------------- + case P_PCK: vbgeta(in, x, *out = x); b = *in++; return mode?bitdunpack32( in, n-1, b, x, out+1):bitd0unpack32( in, n-1, b, x, out+1); + case P_PCKR: vbgeta(in, x, *out = x); b = *in++; return mode?bitfunpackx32(in, n-1, b, x, out+1):bitf0unpackx32(in, n-1, b, x, out+1); + + case P_SIMDH: + if(n < 129) { in = vbytedec(in, n, out); UNDELTA(out, n, mode); } + else { vbgeta(in, x, *out = x); b = *in++; in = simdunpackn1((uint32_t *)in, n-1, b, out[0], out+1); } + break; + } + return in; +} + +//--------------------------------------- Zipfian generator -------------------------------------------------------- +int z_cmp(double **a, double **b) { + if(*a < *b) return -1; + if(*a > *b) return 1; + return 0; +} + +void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) { + int i,m = x2 - x1 + 1; + double prob, cum, *zmap; + if(!(zmap = 
malloc(m*sizeof(zmap[0])))) die("mallo error\n"); + + // generate initial set (slow) + srand48(1); + for(cum = 0.0,i = 0; i < m; i++) + cum += 1.0 / pow(i+1, alpha); + cum = 1.0 / cum; + for(prob = 0.0,i = 0; i < m; i++) + zmap[i] = prob += cum / pow(i+1, alpha); + + // use binary search to speed up zipfgen + qsort(zmap, m, sizeof(zmap[0]), (int(*)(const void*,const void*))z_cmp); + for(i = 0; i < n; i++) { + double r = drand48(); + int l = 0, h = m-1; + while(l < h) { + int k = (l + h) >> 1; + if(r > zmap[k]) l = k + 1; + else h = k; + } + a[i] = x1 + l; + } + free(zmap); +} + +//----------------------------------------------- Benchmark ------------------------------------------------------------------- +struct libss { int id; char *s;int size; }; struct libss libss[] = { - { P_CPY, "copy", }, - { P_VB, "TurboVbyte" }, - { P_VBL, "Vbyte FPF" }, - { P_VG8, "vg8iu" }, + { P_CPY, "Copy", 0 }, + //---------------- Variable byte --------------------------------- + { P_VB, "TurboVbyte", 0 }, + { P_VBL, "VbyteFPF", 0 }, + { P_VG8, "VarintG8IU",0 }, +//{ P_VBP, "VBytePoly" }, + // -------------- Simple family ---------------------------------- + { P_SV, "SimpleV", 0 }, + #ifdef USE_SIMPLE_8B + { P_S64, "Simple-8b",0 }, //crash on 32 bits? + #endif + #ifdef USE_SIMPLE16 + { P_S16, "Simple16", 0 }, //max. 28 bits + #endif + //--------------- PFor ------------------------------------------ + #ifndef _WIN32 + { P_P4DR, "TurboPForDA", 128 }, // actually not working w. mingw + #endif + { P_P4D, "TurboPFor", 128 }, - { P_SV, "simpleV" }, - { P_S8BO, "simple 8b" }, - { P_S16, "simple16" }, + #ifdef USE_OPTPFD + { P_OPTP4, "OptPFD", 128 }, //max. 
28 bits + #endif + //-------------- Bit Packing ------------------------------ + { P_PCK, "TurboPack", PACK_SIZE }, + { P_PCKR, "TurboPackDA", PACK_SIZE }, + { P_SIMDH, "SIMDPackFPF", 128 }, - { P_P4DR, "TurboPFor DA" }, - { P_P4D, "TurboPFor" }, - { P_OPTP4, "OptP4" }, - - { P_PCK, "TurboPack" }, - { P_PCKR, "TurboPack DA" }, - { P_SIMDH, "SIMDBitPack FPF" }, { -1, "" }, }; -//--------------------------------------------------------------------------------------------- -#define MAXT 8 -#define BLK_SIZE 129 -#define MB (1024*1024) +#define MB 1000000 +int verb = 0, reps = 1<<24, trips = 1, xcheck=1; +unsigned xbits[33]; +enum { T_DUP, T_UNI, T_TXT, T_BYTE, T_TST }; -int verb = 0, reps = 100000, trips = 3; -enum { T_ZIPF=1, T_ID }; - -struct libs { int id,err; char *s,*v; unsigned long long l; double tc,td; }; +struct libs { int id,err,size; char *s,*v; unsigned long long l, c[33]; double tc,td; }; struct libs libs[64]; +int libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; } int l_cmp(struct libs *a, struct libs *b) { if(a->l < b->l || a->l == b->l && a->td < b->td) return -1; @@ -349,149 +379,150 @@ void check(unsigned *in, unsigned n, unsigned *out, char *s) { } } -void print(unsigned long long n, char *s) { +void stprint() { + int m; + unsigned long long t=0; + for(m = 0; m < 33; m++) + t += xbits[m]; + printf("\ndistribution:"); + for(m = 0; m < 33; m++) + if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n"); +} + +void print(unsigned long long n, char *s, unsigned long long *u) { int m, k; for(k = 0; libs[k].id >= 0; k++); - qsort(libs, k, sizeof(libs[0]), l_cmp); - + qsort(libs, k, sizeof(libs[0]), (int(*)(const void*,const void*))l_cmp); + char *prtname = s?s:""; { unsigned char *p; if((p = strrchr(prtname, '/')) || (p = strrchr(prtname, '\\'))) prtname = p+1;} for(m = 0; m < k; m++) if(libs[m].l) { struct libs *lb = &libs[m]; - printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", s, lb->l, 
(double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n, + printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", prtname, lb->l, (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n, lb->tc>=0.000001?((double)n/1000000.0) / (lb->tc/TM_T):0.0, lb->td>=0.000001?((double)n/1000000.0) / (lb->td/TM_T):0.0, lb->s ); + if(u && verb>3) { printf("\n");for(k = 0; k < 33; k++) if(u[k]) printf("%d:%.1f\t", k, (double)lb->c[k]*100/u[k]); printf("\n"); } } } -//int libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; } - -unsigned bench(unsigned *__restrict__ _in, unsigned _inlen, int blksize, unsigned char *__restrict__ _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict__ cpy, int bb) { int m,id,b=bb,i; if(verb) { printf(":%d,", _inlen); fflush(stdout);} - unsigned cn; tm_t tt0 = tminit(); +unsigned bench(unsigned *__restrict _in, unsigned _inlen, int blksize, unsigned char *__restrict _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict cpy, int bb, int mode ) { int m,id,b=bb,i; + if(!_inlen) return 0; if(verb>1) { printf(":%d,", _inlen); fflush(stdout); } + unsigned cn; tm_t tt0 = tminit(); for(i = 0; i < 10; i++) memcpy(_out, _in, _inlen); - for(m = 0; (id=libs[m].id) >= 0; m++) { int r,insize=(id==P_OPTP4)?blksize-1:blksize; - struct libs *lb = &libs[m]; unsigned cl; if(verb) { printf("%s", libs[m].s);fflush(stdout); } int t,tj; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt; - for(t = 0; t < trips; t++) { t0 = tminit(); + for(m = 0; (id=libs[m].id) >= 0; m++) { + blksize = libs[m].size?libs[m].size:blksize; + int r,insize=(mode>=0)?blksize+1:blksize; + struct libs *lb = &libs[m]; + unsigned cl,cc[33]; if(verb) printf("%s,%d", libs[m].s, blksize); + int t,tj; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt; + for(t = 0; t < trips; t++) { t0 = tminit(); for(r = 0; r < reps; ) { - cn=cl=0; - unsigned *in; - unsigned char *out,*sout; //vsini(); + for(cl=0; cl<33; cl++) cc[cl]=0; cn=cl=0; + 
unsigned *in; + unsigned char *out; for(out = _out, in = _in; in < _in+_inlen; ) { unsigned n,inlen = *in++,*ip=in; in += inlen; - *(unsigned *)out = inlen; out+=4;/*out++=0x5a;*/ - for(;ip < in; ip += n) { n = ip+insize<=in?insize:in-ip; cn += n; unsigned char *sout=out; //printf("%d ", n); - out = beenc(ip,n,out,id,bb); - cl +=out-sout; - } if(out > _out+outsize) { fprintf(stderr, "Overflow error %lld, %lld in %s\n", outsize, (ptrdiff_t)(out - _out), lb->s); exit(0); } - } r++; if((tt = tmtime() - t0) > tx) break; - } if(tt < tc) { tc = tt; tj = r; } - if(tmtime() - tt0 > tx*trips) { /*printf("#");fflush(stdout);*/ /*sleep(1);*/tt0 = tminit(); } + *(unsigned *)out = inlen; out += 4; unsigned char *sout = out; + for(;ip < in; ip += n) { n = ip+insize <= in?insize:in-ip; cn += n; if(out+5*n > _out+outsize) die("Overflow error %llu, %u in %s\n", outsize, (int)(ptrdiff_t)(out - _out), lb->s); + out = mode >= 0?besenc(ip, n, out, id, mode):beenc(ip, n, out, id, bb); + } + cl += out - sout; cc[bsr32(inlen)] += out - sout; + } + r++; if((tt = tmtime() - t0) > tx) break; + } if(tt < tc) tc = tt, tj = r; //if(tmtime() - tt0 > tx*trips) { sleelp(5); tt0 = tminit(); } } - lb->l += cl; lb->tc += tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("+ ");fflush(stdout);} - tt0 = tminit(); - for(t = 0; t < trips; t++) { t0 = tminit(); - for(r = 0; r < reps; ) { unsigned *out; unsigned char *in; + + for(t=0; t < 33; ++t) lb->c[t] += cc[t]; + + lb->l += cl; lb->tc += (double)tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("/");fflush(stdout);} + tt0 = tminit(); + for(t = 0; t < trips; t++) { t0 = tminit(); + for(r = 0; r < reps; ) { + unsigned *out; unsigned char *in; for(out = cpy, in = _out; out < cpy+_inlen;) { unsigned n,*op, outlen=*(unsigned *)in; in+=4; *out++ = outlen; for(op=out,out += outlen; op < out; op += n) { n = op + insize<=out?insize:out-op; - in = bedec(in,n,op,id,bb); + in = mode>=0?besdec(in,n,op,id, mode):bedec(in,n,op,id,bb); } - } - r++; - if((tt = 
tmtime() - t0) > tx) - break; - } - if(tt < td) { - td = tt; - tj = r; - } - if(tmtime() - tt0 > tx*trips) { - tt0 = tminit(); - } - } lb->td += td/tj; - check(_in, _inlen, cpy, lb->s); + } + r++; if((tt = tmtime() - t0) > tx) break; + } if(tt < td) td = tt, tj = r; + //if(tmtime() - tt0 > tx*trips) tt0 = tminit(); + } + lb->td += (double)td/tj; + if(xcheck) check(_in, _inlen, cpy, lb->s); } return cn; } -int z_cmp(double **a, double **b) { - if(*a < *b) return -1; - if(*a > *b) return 1; - return 0; -} - -void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) { - int i,m = x2 - x1 + 1; - double prob, cum, *zmap; - if(!(zmap = malloc(m*sizeof(zmap[0])))) { - fprintf(stderr, "mallo error\n"); - exit(-1); - }; - - srand48(1); - for(cum =0.0,i = 0; i < m; i++) - cum += 1.0 / pow(i+1, alpha); - cum = 1.0 / cum; - for(prob=0.0,i = 0; i < m; i++) - zmap[i] = prob += cum / pow(i+1, alpha); - qsort(zmap, m, sizeof(zmap[0]), (int(*)(const void*,const void*))z_cmp); - - for(i = 0; i < n; i++) { - double r = drand48(); - int l = 0, h = m-1; - while(l < h) { - int k = (l + h) >> 1; - if(r > zmap[k]) l = k + 1; - else h = k; - } - a[i] = x1 + l; - } - free(zmap); +void usage() { + fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "Usage: icbench [options] [file]\n"); + fprintf(stderr, "Use zipfian generator when no file specified\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -bNm N = blocksize (default 128) m=k kilobyte ex. -b64k\n"); + fprintf(stderr, " -cN N = format ordered(0:delta+0,1:delta+1),2=convert text to integer format\n"); + fprintf(stderr, " -eS N = encoder scheme (default all)\n"); + fprintf(stderr, " -tN N = time in seconds per interation\n"); + fprintf(stderr, " -TN N = Iterations (default 3)\n"); + fprintf(stderr, " -vN N = verbosity 1..3\n"); + fprintf(stderr, "----- file specified --------------\n"); + fprintf(stderr, " -rN N = max. file size to read\n"); + fprintf(stderr, "Ex. 
./icbench -c1 gov2.sorted\n"); + fprintf(stderr, "----- file not specified --------------\n"); + fprintf(stderr, " -aF F = zipfian distribution alpha ex. -a1.0 uniform -a1.5 skewed\n"); + fprintf(stderr, " -mN N = minimum integer generated in bits\n"); + fprintf(stderr, " -MN N = maximum integer generated in bits\n"); + fprintf(stderr, " -nN N = number of integers to generate\n"); + fprintf(stderr, "Ex. ./icbench -a1.0 -m0 -x8 -n100000000\n"); + exit(0); } #define OVD (10*MB) -int main(int argc, char *argv[]) { - char fname[0x100], *cmd=NULL; - unsigned bp=0,ftype = T_ID, rm=0,rx=30,n=10000000; - long long rdmax = 1<<30; tm_t tx=1*1000000; +int main(int argc, char *argv[]) { int r; + char fname[0x100], *cmd=NULL; + unsigned xbp=0, rm=0,rx=30,n=0; + int mode = -1; + long long rdmax = 1ull<<32; double a = 1.5; - + tm_t tx=1*1000000; + unsigned blksize = PACK_SIZE; tminit(); - VarIntG8IU(); - - int c, digit_optind = 0; - int this_option_optind = optind ? optind : 1, option_index = 0; + VarIntG8IU(); + int c, digit_optind = 0, this_option_optind = optind ? 
optind : 1, option_index = 0; static struct option long_options[] = { {"repeat", 0, 0, 'r'}, {0,0, 0, 0} }; for(;;) { - if((c = getopt_long(argc, argv, "Ac:TBR:ys:r:n:b:c:e:t:r:M:v:m:x:a:", long_options, &option_index)) == -1) break; + if((c = getopt_long(argc, argv, "BshHa:b:c:e:f:m:n:r:R:T:v:M:", long_options, &option_index)) == -1) break; switch(c) { - case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; - case 'r': reps = atoi(optarg); break; - case 'R': trips = atoi(optarg); break; - case 'v': verb = atoi(optarg);verb++; break; - case 't': tx = atoi(optarg)*1000000; break; - case 'c': ftype = atoi(optarg); break; - case 'b': rdmax = atoi(optarg)*MB; break; - case 'e': cmd=optarg; break; - case 'm': rm = atoi(optarg); break; - case 'x': rx = atoi(optarg); break; // - case 'B': bp++; break; - case 'n': n = atoi(optarg); break; - case 'a': a = strtod(optarg, NULL); break; - default: fprintf(stdout,"unknown option: %c \n", optopt); exit(1); + case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; + case 'a': a = strtod(optarg, NULL); break; + case 'b': { char *p; blksize = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') blksize *= 1024; if(blksize>BLK_SIZE) blksize = BLK_SIZE; } break; + case 'c': mode = atoi(optarg); break; + case 'f': rdmax = atoi(optarg)*MB; break; + case 'h': usage(); break; + case 'H': xcheck=0; break; + case 'e': cmd = optarg; break; + case 'm': rm = atoi(optarg); break; + case 'n': { char *p; n = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') n *= 1000; else if(*p == 'b' || *p == 'B') n *= 1000000000; else n *= 1000000; } break; + case 'r': reps = atoi(optarg); break; + case 'R': trips = atoi(optarg); break; + case 't': tx = atoi(optarg)*1000000; break; + case 'v': verb = atoi(optarg); break; + case 'M': rx = atoi(optarg); break; + default: usage(); } } - int fno,i=0; //libini(); - 
if(!bp) { rm = (1< n) rx = n; } else if(!rm) rm = 1; - //printf("range=(%d,%d,%d)\n", rm, rx, n);fflush(stdout); + int fno,i=0; + if(!xbp) { rm = (1< n) rx = n; } else if(!rm) rm = 1; //printf("range=(min=%u, max=%u)\n", rm, rx);fflush(stdout); + // build the test functions set struct libss *ls; - if(cmd) { - unsigned char *q=NULL; + if(cmd) { + char *q = NULL; for(i=0,libs[0].id = -1;;) { if(cmd) { - if(!*cmd) break; //printf("cmd='%s'", cmd); + if(!*cmd) break; q = strchr(cmd,','); if(q) *q=' '; if(q = strchr(cmd,'/')) @@ -499,34 +530,32 @@ int main(int argc, char *argv[]) { for(ls = libss; ls->id >= 0; ls++) if(!strcasecmp(ls->s, cmd)) { memset(&libs[i], 0, sizeof(struct libs)); - libs[i].id = ls->id; - libs[i].err = 0; - libs[i].s = ls->s; - libs[i++].v = ls->v; + libs[i].id = ls->id; + libs[i].err = 0; + libs[i].s = ls->s; + libs[i++].size = ls->size; if(verb) printf("%s/", ls->s);fflush(stdout); break; } - if(ls->id < 0) { - printf("library: '%s' not found\n", cmd); - exit(-1); - } + if(ls->id < 0) die("library: '%s' not found\n", cmd); cmd = q?(q+1):""; } } } else for(ls = libss; ls->id >= 0; ls++) { - libs[i].id = ls->id; - libs[i].err = 0; - libs[i].s = ls->s; //printf("%s\n", ls->s);fflush(stdout); - libs[i++].v = ls->v; + libs[i].id = ls->id; + libs[i].err = 0; + libs[i].s = ls->s; if(verb) printf("%s/", ls->s);fflush(stdout); + libs[i++].size = ls->size; } - libs[i].id = -1; - - if(argc <= optind) { - unsigned *in, *out, *cpy,*ip; unsigned long long totlen=0; - in = malloc(n*4+OVD); if(!in) { printf("malloc err=%u", n); exit(0); } - out = malloc(n*4+OVD); if(!out) { printf("malloc err=%u", n); exit(0); } - cpy = malloc(n*4+OVD); if(!cpy) { printf("malloc err=%u", n); exit(0); } + libs[i].id = -1; if(verb) printf("\n"); + + if(argc <= optind) { // No file specified + if(!n) n = 100000000; if(rx > n) rx = n; + unsigned *in, *cpy,*ip; unsigned char *out; unsigned long long totlen=0; + in = malloc(n*4+OVD); if(!in) die("malloc err=%u", n); + out = 
malloc(n*4+OVD); if(!out) die("malloc err=%u", n); + cpy = malloc(n*4+OVD); if(!cpy) die("malloc err=%u", n); char s[33]; s[0]=0; - if(bp) { + if(mode == T_TST) { // Unit test for fixed bit sizes int b; printf("bittest\n"); fflush(stdout); for(b = rm; b <= rx; b++) { @@ -534,84 +563,100 @@ int main(int argc, char *argv[]) { *in = n; for(i = 1; i <= n; i++) in[i] = (1ull << b)-1; - totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, b); - print(totlen, s); + totlen = bench(in+1, n, blksize, out, n*4+OVD, s, tx, cpy, b, mode); + print(totlen, s, NULL); } - } else { - printf("zipf a=%3.1f [%u,%u]\n", a, rm, rx); + } else { // Benchmark w. generated data + printf("zipf alpha=%3.1f range[%u..%u].\nbit size histogramm: ", a, rm, rx); *in = n; - zipfgen(in+1, a, rm, rx, n); //stprint(); - totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, -1); - print(totlen, s); + zipfgen(in+1, a, rm, rx, n); for(i = 1; i <= n; i++) xbits[bsr32(in[i])]++; stprint(); + if(mode>=0) { unsigned *ip=in+1; int v; for(v = 1; v < n; v++) { ip[v] += ip[v-1] + mode; if(ip[v]>(1u<<28)) die("overflow generating sorted array\n" ); } } + totlen = bench(in, n+1, blksize, out, n*4+OVD, s, tx, cpy, -1, mode); + print(totlen, s, NULL); } free(in); free(cpy); free(out); - } else for(fno = optind; fno < argc; fno++) { + } else for(fno = optind; fno < argc; fno++) { // Benchmark w. 
specified data files + libini(); char *inname = argv[fno]; - FILE *fi = fopen64(inname, "r"); - if(!fi) { - fprintf(stderr, "open error '%s'", inname); perror(inname); - exit(-1); - } - fseek(fi, 0, SEEK_END); - unsigned long long fisize = ftell(fi); - fseek(fi, 0, SEEK_SET); - if(fisize > rdmax) - fisize = rdmax; - fisize /= 4; //setvbuf(fi, NULL, _IOFBF, 1000*MB); - unsigned *in, *out, *cpy,*ip; - unsigned long long totlen=0; - int rc; - out = malloc(fisize*4+OVD); if(!out) { printf("malloc err=%u", fisize); exit(0); } - cpy = malloc(fisize*4+OVD); if(!cpy) { printf("malloc err=%u", fisize); exit(0); } - in = malloc(fisize*4+1024); if(!in) { printf("malloc err=%u", fisize); exit(0); } PGM_FD(fileno(fi)); - int r; fread(&r, 4, 1, fi); - while(r > 0) { - for(ip = in; ip+r <= in+fisize;) { - int rc; PGM_FDPUT(fileno(fi)); - if((rc = fread(ip+1, 4, r, fi)) <= 0) - goto a; - - if(r >= rm && r <= rx) { - *ip++ = r; - int j; - if(verb) - printf("%d ", r, ftype==T_ID?"I":"N"); - fflush(stdout); - if(ftype == T_ID) { - for(j = 0; j < r; ) { - unsigned m = j+BLK_SIZE>r?r-j:BLK_SIZE; - int i,did,dido = -1; - for(i = 0; i < m; i++) { - did = ip[i]; - if(did < dido) { - printf("IDs in '%s' not sorted.did=%d,dido=%d ", inname, did, dido); - exit(0); - } - ip[i] = did - dido - 1; - dido = /*ip[0]*/did; //printf("%d,", ip[i]); xbits[bsr32(ip[i])]++; - } - j += m; ip += m; //printf("\r"); - } - } else - ip += r; + if(mode == T_TXT || mode == T_BYTE) { //------------ convert text file to integer array format + FILE *fi = fopen(inname, "r"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } + char outname[257]; strcpy(outname, inname); strcat(outname, ".dat"); + FILE *fo = fopen(outname, "wb"); if(!fo) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } + #define LSIZE 16 + char s[LSIZE+1]; + unsigned num = 0; + fwrite(&num, 1, 4, fo); + if(mode == T_TXT) { + while(fgets(s, LSIZE, fi)) { + s[strlen(s) - 1] = 0; + unsigned i = 
strtoul(s, NULL, 10); + fwrite(&i, 1, 4, fo); + num++; + } + } else { + unsigned u; + unsigned char c; + while(fread(&c, 1, 1, fi)>0){ + u = c; + fwrite(&u, 1, 4, fo); + num++; } - r = rc = 0; - if(ftype == T_ID) - rc = fread(&r, 4, 1, fi); - if(rc <= 0 || !r) - break; } - totlen += bench(in, ip-in, BLK_SIZE, out, fisize*4+OVD, inname, tx, cpy, -1); - if(totlen > n) - break; - } - a:fclose(fi); //stprint(); - print(totlen,inname); - free(in); - free(cpy); - free(out); - } -} + fseeko(fo, 0, SEEK_SET); + fwrite(&num, 1, 4, fo); printf("num=%u\n", num); + fclose(fo); + fclose(fi); + continue; + } + // process integer array file + FILE *fi = fopen64(inname, "rb"); + if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } + fseeko(fi, 0, SEEK_END); + unsigned long long fisize = ftello(fi); fseeko(fi, 0, SEEK_SET); //printf("fisize=%llu\n", fisize); + if(fisize > rdmax) fisize = rdmax; + fisize /= 4; + + unsigned *in, *cpy,*ip,num; unsigned char *out; + unsigned long long outsize=fisize*5+OVD,totlen=0,bitslen[33]={0}; + out = malloc(outsize); if(!out) die("malloc err=%llu", fisize); + cpy = malloc(fisize*4+OVD); if(!cpy) die("malloc err=%llu", fisize); + in = malloc(fisize*4+1024); if(!in) die("malloc err=%llu", fisize); + + ip = in; + while(fread(&num, 1, 4, fi) == 4 && num) { //printf("?");fflush(stdout); + if(num < rm || num > rx) { fseeko(fi, num*4, SEEK_CUR); continue; } + if(ip+num > in+fisize) { + totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode); printf("#%u", (unsigned)(totlen/1000000));fflush(stdout); + if(n && totlen > n) + break; + ip = in; + } + *ip++ = num; if(fread(ip, 4, num, fi) != num) break; + bitslen[bsr32(num)] += num*4; + #ifdef STATS + unsigned *ep = ip+num,insize=(mode>=0)?blksize+1:blksize; + while(ip < ep) { + unsigned m = min(ep-ip, insize),i; + if(mode >= 0) { + for(i = 1; i < m; i++) { + if(verb>3) printf(":%u ", ip[i]);fflush(stdout); + xbits[bsr32((ip[i] - ip[i-1]) - mode)]++; + if(ip[i] 
< ip[i-1]+mode) die("IDs in '%s' not sorted.[did=%u,%u] at line=%d\n", inname, ip[i], ip[i-1], (int)(ip-in)); + } + } else for(i = 0; i < m; i++) xbits[bsr32(ip[i])]++; + ip += m; + } + #else + ip += num; + #endif + } + a:fclose(fi); + totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode); + printf("#%u", (unsigned)(totlen/1000000)); + free(in); free(cpy); free(out); + stprint(); print(totlen,inname, bitslen); + } +} diff --git a/idx.h b/idx.h new file mode 100644 index 0000000..18282f3 --- /dev/null +++ b/idx.h @@ -0,0 +1,20 @@ +#include + +#define BLK_DIDNUM (128+1) // // Block size 128 + 1 (1 stored in skips) + + +// Compression method. Set only one METHOD! + // compressed size for 62 GB clueweb09.sorted + // Defaut is bitpack/bitunpack 18 GB +#define USE_SIMDPACK // SIMD Bitpacking 18 GB +//#define USE_TURBOPFOR // for compact version 12 GB +//#define USE_TURBOPACKD + +//-------------------------- Mapping term id <-> posting offset in file ---------------------------------- +typedef struct { uint8_t offseth; uint32_t offsetl; } __attribute__ ((packed)) tmap_t; // 40 bits offsets -> 1 Terabyte + +#define TIDMAPSET(__t, __ofs) { (__t)->offseth = (__ofs)>>32; (__t)->offsetl = (__ofs) & 0xffffffff; } +#define TIDMAPGET(__t) ((__off64_t)(__t)->offseth << 32 | (__t)->offsetl) +#define TIDMAP(__fdm, __tid) ({ char *_bp = __fdm; tmap_t *_t = (tmap_t *)&_bp[(__tid)*sizeof(tmap_t)]; TIDMAPGET(_t); }) +//-------------------------------------------------------------------------------------------------------- + diff --git a/idxcr.c b/idxcr.c new file mode 100644 index 0000000..37815f6 --- /dev/null +++ b/idxcr.c @@ -0,0 +1,153 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo [AT] gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + idxcr.c - "Integer Compression" Create inverted index for using by idxqry for benchmarking +**/ +#define _LARGEFILE64_SOURCE 1 +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "vint.h" +#include "vp4dc.h" + +#include "bitpack.h" +#include "idx.h" +//-------------------------------------- Simdcomp -------------------------------------------------------------------------- +#include "ext/simdcomp/include/simdbitpacking.h" + +unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) { + uint32_t *in_; + for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b); + return (unsigned char *)out; +} +unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) { + uint32_t *in_; + for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); + return (unsigned char *)out; +} +//--------------------------------------------------------------------------------------------------------------- +#define DELTA( __in, __n, __b) { unsigned _v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); } + +#define TERMNUM 2000000 +int verb; + +void usage() { + fprintf(stderr, 
"\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n"); + fprintf(stderr, "Create inverted index from 'Document identifier data set' format\n"); + fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n"); + fprintf(stderr, "Usage: idxcr \n"); + fprintf(stderr, "ex. idxcr clueweb09.sorted idxdir\n\n"); + exit(-1); +} + +int main(int argc, char *argv[]) { + int fno,c, digit_optind = 0, this_option_optind = optind ? optind : 1, option_index = 0; unsigned char *path=""; + static struct option long_options[] = { {"r", 0, 0, 'r'}, {0,0, 0, 0} }; + for(;;) { + if((c = getopt_long(argc, argv, "xv:", long_options, &option_index)) == -1) break; + switch(c) { + case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; + case 'v': verb = atoi(optarg); break; + default: die("unknown option: %c \n", optopt); + } + } + if(argc - optind < 2) usage(); + tmap_t *tmap = malloc(TERMNUM*sizeof(tmap_t)); if(!tmap) die("malloc error\n"); + path = argv[--argc]; + + for(fno = optind; fno < argc; fno++) { + char outname[257], *inname = argv[fno]; + strcpy(outname, path); + unsigned char *p = strrchr(inname,'/'); + if(!p) p = strrchr(inname,'\\'); if(!p) p=inname; + strcat(outname, p); strcat(outname,".i"); + + FILE *fi = fopen64(inname, "rb"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } int fdi = fileno(fi); + FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname); + fseeko(fo, sizeof(unsigned)+sizeof(unsigned long long), SEEK_SET); + + unsigned *in = NULL,*ip,*ep,num,tid=0,numx=0,outsize; + unsigned char *out = NULL; + unsigned long long fofs; + + while(fread(&num, 1, 4, fi) == 4 && num) { // read number of docid in term + unsigned bnum = (num+BLK_DIDNUM-1)/BLK_DIDNUM; + if(num > numx) { 
numx = num; + in = realloc(in, num*4+64); + outsize = num*4+bnum*sizeof(unsigned)*2+1024; + out = realloc(out, outsize); + if(!in || !out) die("malloc err=%u", num); + } + + if(fread(in, 4, num, fi) != num) break; // read docid list + unsigned char *op = out,*_op; + vbput(op, num); // store f_t + + unsigned *pix = (unsigned *)op; + if(num > BLK_DIDNUM) op += bnum*sizeof(unsigned)*2; + for(_op = op, ip = in, ep = ip+num; ip < ep; ) { + if(num > BLK_DIDNUM) { // skip/index. docid[0] and offset to compressed block + *pix = ip[0]; // First docid + pix[bnum] = op-_op; // offset + pix++; + } else vbput(op, ip[0]); // skip not needed + + unsigned n = min(ep-ip, BLK_DIDNUM),b=0; if(op+5*n > out+outsize) die("output buffer too small\n"); + if(n > 1) { + DELTA(ip, n, b); + #ifdef USE_SIMDPACK + if(n < 129) { *op++ = b; op = bitpack32( ip+1, n-1, b, op); } //op = vbenc(ip+1, n-1, op); + else { *op++ = b; op = simdpackwn(ip+1, n-1, b, (unsigned *)op); } + #elif defined(USE_TURBOPFOR) + op = p4denc32( ip+1, n-1, op); + #else + *op++ = b; op = bitpack32(ip+1, n-1, b, op); + #endif + } + ip += n; + } + fofs = ftello(fo); + tmap_t *t = &tmap[tid++]; + TIDMAPSET(t, fofs); + if(fwrite(out, 1, op-out, fo) < 0) die("fwrite error\n"); + } + fofs = ftello(fo); // write termmap + if(fwrite(tmap, 1, tid*sizeof(tmap_t), fo) < 0) die("fwrite error\n"); + + fseeko(fo, 0, SEEK_SET); + if(fwrite(&fofs, 1, sizeof(unsigned long long), fo) < 0) die("fwrite error\n"); + if(fwrite(&tid, 1, sizeof(unsigned), fo) < 0) die("fwrite error\n"); + + fclose(fi); fclose(fo); + if(in) { free(in); free(out); } + } +} diff --git a/idxqry.c b/idxqry.c new file mode 100644 index 0000000..06365a7 --- /dev/null +++ b/idxqry.c @@ -0,0 +1,364 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at 
your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo [AT] gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo +**/ +#define _LARGEFILE64_SOURCE 1 +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include + #ifndef _WIN32 +#include +#include +#include +#include + #endif +#include + +#include "conf.h" +#include "vint.h" +#include "bitunpack.h" +#include "vp4dd.h" +#include "idx.h" + +#define STATS +//---------------------------------------- Time --------------------------------------------------------------------- +typedef unsigned long long tm_t; +#define TM_TMAX (1ull<<63) + +#include +#define TM_T 1000000.0 +static tm_t tmtime(void) { struct timeval tm; gettimeofday(&tm, NULL); return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; } +static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; } +static double tmsec( tm_t tm) { return (double)tm/1000000.0; } +static double tmmsec(tm_t tm) { return (double)tm/1000.0; } + +//--------------------------------------- Simdcomp ------------------------------------------------------------------- +#include "ext/simdcomp/include/simdbitpacking.h" +unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) { + uint32_t k, *out_; + for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack((const __m128i *)in, out, b); + return (unsigned char *)in; +} +unsigned char *simdunpackn1(uint32_t *in, 
uint32_t n, uint32_t b, uint32_t start, uint32_t *out) { + uint32_t k, *out_; + for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b); + return (unsigned char *)in; +} + +//------------------------------------- index file (created by idxcr) ------------------------------------------------------------- +typedef struct { // Index + unsigned char *fdp, // posting + *fdm; // mapping term id to offset in posting + unsigned long long fdsize; + unsigned tnum; +} idxrd_t; + +int idxopen(idxrd_t *idx, char *s) { + int fd; char *p; + if((fd = open(s, O_RDONLY| O_LARGEFILE)) < 0) + die("can't open index file '%s' rc=%d:%s\n", s, errno, strerror(errno)); + struct stat sbuf; // Memory mapped access + fstat(fd, &sbuf); + if(sbuf.st_size > 0 && (p = mmap( NULL, sbuf.st_size , PROT_READ, MAP_SHARED|MAP_NORESERVE, fd, 0)) == (void *)-1) + die("mmap errno=%d,'%s'\n", errno, strerror(errno) ); + close(fd); + + idx->fdsize = sbuf.st_size; + idx->fdp = p; + idx->fdm = p + *(uint64_t *)p; p += sizeof(uint64_t); // Termid map table. 
Termid->Posting + idx->tnum = *(unsigned *)p; + return 0; +} + +int idxclose(idxrd_t *idx) { + munmap(idx->fdp, idx->fdsize); +} + +//--------------------------------- Posting -------------------------------------------------------------- +#ifdef STATS +unsigned long long st_tot,st_dec; +#define STATINI st_tot=st_dec=0 +#define STAT(a) a +#else +#define STATINI +#define STAT(a) +#endif + +typedef struct { + unsigned char *bp,*p; + unsigned f_t,_f_t, did,ldid; + int didno,didnum, bno, bnum; +} post_t; + +// Init posting for term id tid +int postinit( post_t *post, int tid, idxrd_t *idx, unsigned *dids) { + unsigned long long o = TIDMAP(idx->fdm, tid); if(!o) return 0; + unsigned char *p = idx->fdp + o; // start of posting; + post->f_t = vbget(p); // num docs + post->bnum = (post->f_t+BLK_DIDNUM-1)/BLK_DIDNUM; // num blocks + post->_f_t = post->f_t; + post->didno = post->bno = -1; + post->bp = p; // start skip block + post->p = p + post->bnum*sizeof(unsigned)*2; // start posting block + dids[0] = INT_MAX; + post->ldid = 0; post->did = -1; + post->didnum = min(post->f_t,BLK_DIDNUM); STAT(st_tot += post->f_t); + if(post->f_t <= BLK_DIDNUM) post->bno=post->bnum; + return post->f_t; +} + +// Get next docid. 
Return value >= INT_MAX at end of posting +static inline ALWAYS_INLINE unsigned postnext(post_t *post, unsigned *dids) { + if((post->did += dids[++post->didno] + 1) < INT_MAX) return post->did; + + unsigned char *p = post->bp; + if(post->f_t > BLK_DIDNUM) { + if(++post->bno >= post->bnum) return INT_MAX; + unsigned *pix = (unsigned *)p + post->bno; + dids[0] = *pix; // first did in block + p = post->p + pix[post->bnum]; // o=offset to posting block + } else dids[0] = vbget(p); + + post->didnum = min(post->_f_t, BLK_DIDNUM); + post->_f_t -= post->didnum; //STAT(st_dec+=post->didnum); + if(post->didnum > 1) { + #if defined(USE_SIMDPACK) + unsigned b = *p++; + if(post->didnum < 129) p = bitunpack32(p, post->didnum-1, b, &dids[1]); //p = vbdec(p, post->didnum-1, &dids[1]); + else { p = simdunpackn( (unsigned *)p, post->didnum-1, b, &dids[1]); } + #elif defined(USE_TURBOPFOR) + p = p4ddec32( p, post->didnum-1, &dids[1]); + #else + unsigned b = *p++; p = bitunpack32(p, post->didnum-1, b, &dids[1]); + #endif + } + dids[post->didnum] = INT_MAX; + post->didno = 0; + return post->did = dids[0]; +} + +// Get next docid equal or greater than the parameter did +static inline ALWAYS_INLINE unsigned postget(post_t *post, unsigned did, unsigned *dids) { + if(did < post->ldid) { // pending dids + for(;;) { + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + } + if(post->did < INT_MAX) return post->did; + } + + unsigned char *p = post->bp; //Skip index + if(post->f_t > BLK_DIDNUM) { + unsigned *_q = (unsigned *)p,*q=_q+(++post->bno),*qe=_q+post->bnum-1; + for(;;) { + if(q[1] >= did || q >= qe) break; q++; + if(q[1] >= did || q >= qe) break; q++; + if(q[1] >= did || q >= qe) break; q++; + if(q[1] >= did || q >= qe) break; q++; + } + post->bno = q - _q; + if(q < qe) { + 
if(did < _q[0]) { post->bno=-1;post->ldid = _q[0]; return _q[0]; } + post->ldid = q[1]; + } else { + post->ldid = INT_MAX; + post->didnum = post->f_t - post->bno*BLK_DIDNUM; + q = qe; + } + post->bno = q-_q; + dids[0] = post->did = *q; // first did in block + p = post->p+q[post->bnum]; // o=offset to posting block + } else { + post->ldid = INT_MAX; + dids[0] = post->did = vbget(p); + } + STAT(st_dec+=post->didnum); + if(post->didnum > 1) { + #if defined(USE_SIMDPACK) + unsigned b = *p++; + if(post->didnum < 129) p = bitunpack32(p, post->didnum-1, b, &dids[1]); //p = vbdec(p, post->didnum-1, &dids[1]); + else { p = simdunpackn( (unsigned *)p, post->didnum-1, b, &dids[1]); } + #elif defined(USE_TURBOPFOR) + p = p4ddec32( p, post->didnum-1, &dids[1]); + #else + unsigned b = *p++; p = bitunpack32(p, post->didnum-1, b, &dids[1]); + #endif + } + dids[post->didnum] = INT_MAX; + for(post->didno=0; ; ) { + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + if(post->did >= did) break; post->did += dids[++post->didno]+1; + } + return (post->did >= INT_MAX)?post->ldid:post->did; +} +//----------------------------------------- query search ------------------------------------------ +#define TERMNUM 32 + +typedef struct { + int term[TERMNUM], terms, id; +} qry_t; + +int postcmp(post_t *a, post_t *b) { + if(a->f_t < b->f_t) return -1; + if(a->f_t > b->f_t) return 1; + return 0; +} + +int intersec_max; + +unsigned idxsearch(idxrd_t *idx, qry_t *q) { + int f_t = 0, i; + post_t *p, *pe, post[TERMNUM]; + unsigned did, elim, dids[TERMNUM][BLK_DIDNUM+31]; + + if(q->terms == 1) { // 1 Term query + if(!(f_t = postinit(post, q->term[0], idx, dids[0]))) + return 0; + for(i = 0; i < min(f_t,intersec_max); i++) { + if((did = postnext(post, dids[0])) >= INT_MAX) break; + f_t++; + } + } else if(q->terms == 2) { // optimized 2 terms query + 
if(!postinit(&post[0], q->term[0], idx, dids[0]) || !postinit(&post[1], q->term[1], idx, dids[1])) + return 0; + if(post[1].f_t < post[0].f_t) { post_t t = post[0]; post[0] = post[1]; post[1] = t; } // swap + for(elim=did=0,f_t=0;;) { + if(unlikely((did = postget(&post[0], did, dids[0])) >= INT_MAX)) break; + if(( elim = postget(&post[1], did, dids[1])) == did) { + if(++f_t >= intersec_max) break; + did++; + continue; + } else if(elim >= INT_MAX) break; + did = elim; + } + } else { // multiple terms conjunctive query + pe = &post[q->terms]; + for(p = post; p < pe; p++) + if(!postinit(p, q->term[p-post], idx, dids[p-post])) return 0; + qsort(post, q->terms, sizeof(post[0]), (int(*)(const void*,const void*))postcmp); // sort by f_t + + for(did = 0;;did++) { + a:if(unlikely((did = postget(post, did, dids[0])) >= INT_MAX)) return f_t; + for(p = &post[1]; p < pe; p++) { + if((elim = postget(p, did, dids[p-post])) == did) continue; + if(elim >= INT_MAX) return f_t; + did = elim; + goto a; + } + if(++f_t >= intersec_max) break; + } + } + return f_t; +} + +//------------------------------ Test + Benchmark ---------------------------------------------------- +#define QRYLEN 255 +int qline, temin = 1,temax = TERMNUM,tex=0,qmax=1<<30; +unsigned long long qrybatch(idxrd_t *idx, char *fqname, int *qid) { + char s[QRYLEN+1],*p,*q; + int id=0; + unsigned long long f_t=0; + FILE *fq; + + if(!(fq = fopen(fqname, "r+"))) + die("can't open file '%s'\n", fqname); + + while(fgets(s, QRYLEN, fq)) { ++qline; + s[strlen(s)-1]=0; + qry_t qry; + for(qry.terms=0,p=s; *p && qry.terms < TERMNUM; ) { + while(*p && (*p < '0' || *p > '9')) p++; if(!*p) break; + q = p; while(*p >= '0' && *p <= '9') p++; + qry.term[qry.terms++] = strtol(q, NULL, 10); + } + if(qry.terms >= temin && qry.terms <= temax) { //int j; for(j=0;j < qry.terms;j++) { if(j) printf(" "); printf("%u", qry.term[j]); } printf(" %d \n", qry.terms); + qry.id = ++id; tex = max(qry.terms,tex); + f_t += idxsearch(idx, &qry); if(id >= 
qmax) break; + } + } + fclose(fq); + *qid = id; + return f_t; +} + +void usage() { + fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__); + fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n"); + fprintf(stderr, "Benchmark for intersections in inverted index\n\n"); + fprintf(stderr, "Usage: idxqry [options] \n"); + fprintf(stderr, "\n"); + fprintf(stderr, " -nN N = max. intersections/query. ex. -n1k=100.000 -n1m=1.000.000\n"); + fprintf(stderr, " -mN N = minimum query terms (default 1)\n"); + fprintf(stderr, " -MN N = maximum query terms (default 16)\n"); + fprintf(stderr, " -rN N = number of iterations (default 3)\n"); + fprintf(stderr, " -qN N = max. number of queries\n"); + fprintf(stderr, " index created by 'idxcr' program\n"); + fprintf(stderr, "Ex. idxqry -n100k -m2 clueweb.sorted.i aol.txt\n"); + fprintf(stderr, "Ex. idxqry gov2.sorted.i 1mq.txt\n"); + fprintf(stderr, "8-16 GB RAM recommended\n\n"); + exit(-1); +} + +int main(int argc, char **argv ) { + int reps = 3,i; + + int c, digit_optind = 0, this_option_optind = optind ? optind : 1, option_index = 0; + static struct option long_options[] = { {"", 0, 0, 'r'}, {0,0, 0, 0} }; + for(;;) { + if((c = getopt_long(argc, argv, "n:m:M:q:r:", long_options, &option_index)) == -1) break; + switch(c) { + case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break; + case 'q': qmax = atoi(optarg); break; + case 'r': reps = atoi(optarg); break; + case 'm': temin = atoi(optarg); break; + case 'M': temax = atoi(optarg); break; + case 'n': { char *p; intersec_max = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') intersec_max *= 1000; else if(*p == 'm' || *p == 'M') intersec_max *= 1000000; } break; + default: usage(); + } + } + if(argc <= optind) usage(); + if(intersec_max) printf("Max. 
Intersections/query=%d\n", intersec_max); + else intersec_max=1<<30; + + idxrd_t idx; + if(idxopen(&idx, argv[optind])) + die("can't open idx file '%s'\n", argv[optind]); + for(i=0; i < reps; i++) { STATINI; + int id; tm_t t0 = tminit(); + unsigned long long inum = qrybatch(&idx, argv[optind+1], &id ); tm_t t1 = tmtime()-t0; + printf("qry=%d/%.2fs. [%.1f q/s] [%.3f ms/q] %llu docs found\n", id, tmsec(t1), (double)id/tmsec(t1), tmmsec(t1)/(double)id, inum ); + if(i 30) sleep(20); + } + idxclose(&idx); + #ifdef STATS + if(st_tot) printf("Terms=[%d-%d] Integers: total=%llu decoded=%llu ratio=%.2f%%\n", temin, tex, st_tot, st_dec, (double)st_dec*100/(double)st_tot); + #endif +} + diff --git a/makefile b/makefile index 0488c6e..1b7b678 100644 --- a/makefile +++ b/makefile @@ -1,28 +1,47 @@ -# powturbo (c) Copyright 2007-2013 -CFLAGS=-ffast-math -fstrict-aliasing -march=native -w -fpermissive +# powturbo (c) Copyright 2007-2015 +CFLAGS=-ffast-math -DNDEBUG -fstrict-aliasing -m64 -march=native BIT=./ -all: icbench +all: icbench idxcr idxqry bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunpack64_.h - cc -O2 $(CFLAGS) -c $(BIT)bitunpack.c + gcc -O3 $(CFLAGS) -c $(BIT)bitunpack.c bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h - cc -O2 $(CFLAGS) -c $(BIT)bitpack.c + gcc -O2 $(CFLAGS2) -c $(BIT)bitpack.c vp4dc.o: $(BIT)vp4dc.c - cc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c + gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dc.c -SIMDCOMPD=aux/simdcomp/ +vp4dd.o: $(BIT)vp4dd.c + gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dd.c + +SIMDCOMPD=ext/simdcomp/ SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o -varintg8iu.o: $(BIT)aux/varintg8iu.c $(BIT)aux/varintg8iu.h - cc -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)aux/varintg8iu.c +varintg8iu.o: $(BIT)ext/varintg8iu.c $(BIT)ext/varintg8iu.h + gcc -O2 $(CFLAGS) -c 
-funroll-loops -std=c99 $(BIT)ext/varintg8iu.c -icbench: icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o varintg8iu.o vp4dd.o vp4dc.o $(SIMDCOMP) - cc -O3 icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o vp4dd.o vp4dc.o varintg8iu.o $(SIMDCOMP) -lm -o icbench $(LFLAGS) +icbench: icbench.o bitpack.o bitunpack.o vsimple.o vp4dd.o vp4dc.o varintg8iu.o ext/simple8b.o $(SIMDCOMP) + gcc -O3 icbench.o bitpack.o bitunpack.o vsimple.o vp4dd.o vp4dc.o varintg8iu.o ext/simple8b.o $(SIMDCOMP) -lm -o icbench $(LFLAGS) + +idxcr: idxcr.o bitpack.o $(SIMDCOMP) vp4dc.o vsimple.o + gcc -O3 idxcr.o bitpack.o $(SIMDCOMP) vp4dc.o vsimple.o -o idxcr $(LFLAGS) + +idxqry: idxqry.o bitunpack.o $(SIMDCOMP) vp4dd.o + gcc -O3 idxqry.o bitunpack.o $(SIMDCOMP) vp4dd.o -o idxqry $(LFLAGS) .c.o: - cc -O3 $(CFLAGS) $< -c -o $@ + gcc -O3 $(CFLAGS) $< -c -o $@ +clean: + rm *.o + rm ext/*.o + rm ext/simdcomp/*.o + rm ext/simdcomp/src/*.o +cleanw: + del .\*.o + del ext\*.o + del ext\simdcomp\*.o + del ext\simdcomp\src\*.o diff --git a/vint.h b/vint.h index 5169b5c..6e7f4ce 100644 --- a/vint.h +++ b/vint.h @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -27,44 +27,62 @@ #ifndef VINT_H #define VINT_H #include "conf.h" + +#ifdef __cplusplus +extern "C" { +#endif + //-------------------------------------- variable byte : 32 bits ---------------------------------------------------------------- - //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 -static unsigned char vtab[]= { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 }; #define vbvlen(__x) vtab[(__x)&0xf] #define vbputa(__op, __x, __act) {\ - if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\ + if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\ else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\ else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\ else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\ else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\ } - + #define vbgeta(__ip, __x, __act) do { __x = *__ip;\ - if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\ - else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\ + if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\ + else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\ else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | *(__ip+2) << 13; __ip += 3; __act;}\ - else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\ - else { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\ + else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\ + else { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\ } while(0) 
+//------------------------------------------------------------------------------------------------------------------------ + //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111 +static unsigned char vtab[]= { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 }; +// Length of uncompress value. Input __x is the compressed buffer start -#define vblen(_x_) ({ unsigned __x = _x_; __x > 0x7f?(__x > 0x3fff?(__x > 0x1fffff?(__x > 0x0fffffff?5:4):3):2):1; }) +// Length in bytes of compressed "__x" when using variable byte +#define vblen(__x) ({ unsigned _x = __x; _x > 0x7f?(_x > 0x3fff?(_x > 0x1fffff?(_x > 0x0fffffff?5:4):3):2):1; }) + +// compress single value #define vbput(__op, __x) { unsigned _x__ = __x; vbputa(__op, _x__, ;); } +// decompress single value #define vbget(__ip) ({ unsigned _x_; vbgeta(__ip, _x_, ;); _x_; }) -static inline unsigned char *vbenc (unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) vbput(out, *in++); return out;} -static inline unsigned char *vbdec (unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;} +// compress array with n unsigned (32 bits in[n]) values to the buffer out. Return value = end of compressed buffer out +static inline unsigned char *vbenc (unsigned *__restrict in, int n, unsigned char *__restrict out) { unsigned *in_ = in +n; while(in < in_) vbput(out, *in++); return out;} + +// decompress buffer into an array of n unsigned values. 
Return value = end of decompressed buffer in +static inline unsigned char *vbdec (unsigned char *__restrict in, int n, unsigned *__restrict out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;} //--------------------------------------- variable byte : 15 bits ------------------------------------------------------------------- -#define vblen16(__x) ((__x) > 0x7f?2:1) #define vbput16(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0) #define vbgeta16(__ip,__x, __act) do { if((__x = *__ip++) > 0x7f) __x = (__x & 0x7f) << 8 | *__ip++; __act; } while(0) + +#define vblen16(__x) ((__x) > 0x7f?2:1) #define vbget16(__ip) ({ unsigned _x; vbgeta16(__ip, _x, ;); _x; }) -static inline unsigned char *vbenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned short *in_ = in +n; while(in < in_) vbput16(out, *in++); return out;} -static inline unsigned char *vbdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out) { unsigned short *out_ = out+n,x; while(out < out_) vgeta16(in, x, *out++ = x); return in; } +// like vbenc32 but for 16 bits values +static inline unsigned char *vbenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out) { unsigned short *in_ = in +n; while(in < in_) vbput16(out, *in++); return out;} +// like vbdec32 but for 16 bits values +static inline unsigned char *vbdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out) { unsigned short *out_ = out+n,x; while(out < out_) vgeta16(in, x, *out++ = x); return in; } +#ifdef __cplusplus +} #endif - - +#endif diff --git a/vp4dc.c b/vp4dc.c index 17d323f..91047c0 100644 --- a/vp4dc.c +++ b/vp4dc.c @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -24,18 +24,22 @@ vp4dd.c - "Integer Compression" Turbo PforDelta **/ +#include #include "conf.h" #include "bitpack.h" + #include "vp4dc.h" #define PAD8(__x) ( (((__x)+8-1)/8) ) #include +//------------------------------------------ +#define P4DSIZE 128 //64 // +#define P4DENC p4denc #define USIZE 32 #include "vp4dc_.h" +#undef USIZE #define USIZE 16 #include "vp4dc_.h" - - diff --git a/vp4dc.h b/vp4dc.h index e23a94b..9c81cb9 100644 --- a/vp4dc.h +++ b/vp4dc.h @@ -16,12 +16,21 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - vp4dc.h - "Integer Compression" Turbo PforDelta + vp4dc.h - "Integer Compression" TurboPfor (see vp4dd.h for decompression) **/ -unsigned char *p4denc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out); -unsigned char *p4denc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out); +#ifdef __cplusplus +extern "C" { +#endif + +// compress integer array with n values to the buffer out. Return value = end of compressed buffer out +unsigned char *p4denc32(unsigned *__restrict in, int n, unsigned char *__restrict out); +unsigned char *p4denc16(unsigned short *__restrict in, int n, unsigned char *__restrict out); + +#ifdef __cplusplus +} +#endif diff --git a/vp4dc_.h b/vp4dc_.h index 75fd9f3..86ac68c 100644 --- a/vp4dc_.h +++ b/vp4dc_.h @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -24,15 +24,23 @@ vp4dc_.c - "Integer Compression" Turbo PforDelta **/ #define uint_t TEMPLATE3(uint, USIZE, _t) +#define P4DN (P4DSIZE/64) -unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ out) { - int i; unsigned cnt[USIZE+1] = {0}; uint_t b = 0; - for(i = 0; i < n; i++) b |= in[i], ++cnt[TEMPLATE2(bsr, USIZE)(in[i])]; +unsigned char *TEMPLATE2(P4DENC, USIZE)(uint_t *__restrict in, int n, unsigned char *__restrict out) { unsigned char *op = out; + int i,b=0; unsigned cnt[USIZE+1] = {0}; uint_t *ip; + + for(ip = in; ip < in+(n&~3); ) { + ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; + ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; + ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; + ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++; + } + while(ip < in+n) b |= *ip, ++cnt[TEMPLATE2(bsr, USIZE)(*ip++)]; b = TEMPLATE2(bsr, USIZE)(b); - unsigned xb=b, ml = PAD8(n*b)+1,x = cnt[b]; + unsigned xb = b, ml = PAD8(n*b)+1,x = cnt[b]; for(i = b-1; i >= 0; i--) { - unsigned l = PAD8(n*i) + (x?(2+16+PAD8(x*(xb-i))):1); + unsigned l = PAD8(n*i) + 2+P4DN*8+PAD8(x*(xb-i)); if(l < ml) b = i, ml = l; x += cnt[i]; /*if(x >= 64) break;*/ } @@ -40,9 +48,10 @@ unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned *out++ = b << 1; return TEMPLATE2(bitpack, USIZE)(in, n, b, out); } - xb-=b; - uint_t _in[0x100], inx[0x100]; unsigned miss[0x100]; - unsigned long long xmap[2]; xmap[0] = xmap[1] = 0; unsigned xn, msk = (1ull<> b; - xmap[c>>6] |= (1ull<<(c&0x3f)); + xmap[c>>6] |= (1ull << (c&0x3f)); } - *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2; out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out); - *(unsigned long long *)out = xmap[0]; out += 8; - *(unsigned long long *)out = xmap[1]; out += 8; - 
memset(&inx[xn],0,128); - return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out); + *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2; out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out); + for(i=0;i < P4DN; i++) { *(unsigned long long *)out = xmap[i]; out += 8; } //memset(&inx[xn],0,P4DSIZE); + return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out); //if(op-out >= PAD8(n*b)+1) { printf("Fatal error b=%d,xb=%d\n", b, xb); exit(0); } return out; } diff --git a/vp4dd.c b/vp4dd.c index 2d9e452..855013f 100644 --- a/vp4dd.c +++ b/vp4dd.c @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -24,17 +24,20 @@ vp4dd.c - "Integer Compression" Turbo PforDelta **/ +#include #include "conf.h" #include "bitunpack.h" #include "vp4dd.h" - + #define PAD8(__x) ( (((__x)+8-1)/8) ) -#include + #define USIZE 32 #include "vp4dd_.h" +#undef USIZE -//#define USIZE 16 -//#include "vp4dd_.h" +#define USIZE 16 +#include "vp4dd_.h" +#undef USIZE diff --git a/vp4dd.h b/vp4dd.h index 71af111..fcb740c 100644 --- a/vp4dd.h +++ b/vp4dd.h @@ -16,33 +16,43 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo vp4dd.h - "Integer Compression" Turbo PforDelta **/ -unsigned char *p4ddec32( unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); -unsigned char *p4ddecx32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); -//----------------------------------------------------------------------- +#ifdef __cplusplus +extern "C" { +#endif + +#define P4DSIZE 128 //64 // +#define P4DN (P4DSIZE/64) + +//---------------- Bulk decompress of TurboPFor compressed integer array ------------------------------------------------------- +// decompress a previously (with p4denc32) 32 bits packed array. Return value = end of packed buffer in +unsigned char *p4ddec32( unsigned char *__restrict in, int n, unsigned *__restrict out); + +//---------------- Direct Access functions to compressed TurboPFor array ------------------------------------------------------- #define P4D_PAD8(__x) ( (((__x)+8-1)/8) ) #define P4D_XB(__x) ((__x & 1)?(__x >> 8):0) #define P4D_B(__x) ((__x >> 1) & 0x3f) -#define P4D_ININC(__in, __x) __in += 1+(__x & 1) +#define P4D_ININC(__in, __x) __in += 1+(__x & 1) -static inline unsigned vp4dbits(unsigned char *__restrict__ in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); } +static inline unsigned vp4dbits(unsigned char *__restrict in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); } struct p4d { unsigned long long *xmap; unsigned char *ex; - unsigned i,xb,cum[2]; + unsigned i,xb,cum[P4DN+1]; int oval,idx; }; -static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int n, unsigned *b) { unsigned char *in = *pin; - static unsigned long long xmap[2] = { 0 }; +// prepare direct access usage +static inline void p4dini(struct p4d *p4d, unsigned char *__restrict 
*pin, int n, unsigned *b) { unsigned char *in = *pin; + static unsigned long long xmap[P4DN+1] = { 0 }; unsigned i = *(unsigned short *)in; p4d->i = i; @@ -52,22 +62,33 @@ static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int *pin = in; p4d->ex = in + P4D_PAD8(n*(*b)); - p4d->xmap = (i&1)?p4d->ex:xmap; - p4d->ex += (i&1)?16:0; + p4d->xmap = (i&1)?(unsigned long long *)p4d->ex:xmap; + p4d->ex += (i&1)?8*P4DN:0; p4d->cum[0] = 0; - p4d->cum[1] = popcnt64(p4d->xmap[0]); + for(i=1; i < P4DN; i++) p4d->cum[i] = p4d->cum[i-1] + popcnt64(p4d->xmap[i-1]); p4d->oval = p4d->idx = -1; } -static ALWAYS_INLINE unsigned vp4dget32(struct p4d p4d, unsigned char *__restrict__ in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx32(in, b, idx*b); - if(unlikely(p4d.xmap[bi = idx>>6] & (1ull<<(cl = idx & 0x3f)))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<>6] & (1ull<<(cl = idx & 0x3f)))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<oval += vp4dget(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; } +// Get the next single value greater of equal to val +static ALWAYS_INLINE int vp4dgeq(struct p4d *p4d, unsigned char *__restrict in, unsigned b, int val) { do p4d->oval += vp4dget(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; } +/* like p4ddec32 but using direct access. This is only a demo showing direct access usage. 
Use p4ddec32 for instead for decompressing entire blocks */ +unsigned char *p4ddecx32(unsigned char *__restrict in, int n, unsigned *__restrict out); +unsigned char *p4dfdecx32(unsigned char *__restrict in, int n, unsigned start, unsigned *__restrict out); +unsigned char *p4df0decx32(unsigned char *__restrict in, int n, unsigned start, unsigned *__restrict out); + +#ifdef __cplusplus +} +#endif diff --git a/vp4dd_.h b/vp4dd_.h index f92ce5f..720e533 100644 --- a/vp4dd_.h +++ b/vp4dd_.h @@ -16,14 +16,15 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo vp4dd_.h - "Integer Compression" Turbo PforDelta **/ - #ifdef __AVX2__ + + #ifdef USE__AVX2__ // disabled per default. #include static ALIGNED(unsigned char, shuffles[256][8], 32) = { @@ -291,23 +292,43 @@ static ALIGNED(unsigned char, shuffles[256][8], 32) = { #define uint_t TEMPLATE3(uint, USIZE, _t) -unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, uint_t *__restrict__ out) { - uint_t ex[0x100+8]; unsigned i = *(unsigned short *)in; uint_t b = P4D_B(i); unsigned xb = P4D_XB(i); +unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict in, int n, uint_t *__restrict out) { + uint_t ex[0x100+8]; + unsigned i = *(unsigned short *)in; + uint_t b = P4D_B(i); + unsigned xb = P4D_XB(i); P4D_ININC(in,i); - in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out); - if(i & 1) { - unsigned long long b0 = *(unsigned long long *)in; in += 8; unsigned long long b1 = *(unsigned long long *)in; in += 8; - in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(b0) + popcnt64(b1), xb, ex); - #ifdef __AVX2__ - unsigned *op,*pex = ex; - for(op = out; b0; b0 >>= 8,op += 8) { const unsigned m = (unsigned char)b0, mc=popcnt32(m), s = 
pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } - for(op = out+64; b1; b1 >>= 8,op += 8) { const unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; - _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; - } - #elif defined(__SSE4_1__) + in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out); + if(i & 1) { + #if P4DN == 2 + unsigned long long bb[P4DN]; unsigned num=0; + bb[0] = *(unsigned long long *)in; in += 8; + bb[1] = *(unsigned long long *)in; in += 8; + in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(bb[0]) + popcnt64(bb[1]), xb, ex); + #else + unsigned long long bb[P4DN]; unsigned num=0; + for(i = 0; i < P4DN; i++) { bb[i] = *(unsigned long long *)in; in += 8; num += popcnt64(bb[i]); } + in = TEMPLATE2(bitunpack, USIZE)(in, num, xb, ex); + #endif + + #if 0 //def __AVX2__ + uint_t *op,*pex = ex; + #if 0 //P4DN == 2 + for(op = out; b0; b0 >>= 8,op += 8) { unsigned m = (unsigned char)b0, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; + } + for(op = out+64; b1; b1 >>= 8,op += 8) { unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const 
__m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; + } + #else + for(i = 0; i < P4DN; i++) { + for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; + } out += 64; + } + #endif + #elif defined(__SSE4_1__) static ALIGNED(char, shuffles[16][16], 16) = { #define _ 0x80 { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, @@ -328,23 +349,38 @@ unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, u { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, #undef _ }; - unsigned *op,*pex = ex; - for(op = out; b0; b0 >>= 4,op+=4) { const unsigned m = b0&0xf; + uint_t *op,*pex = ex; + + #if P4DN == 2 + for(op = out; bb[0]; bb[0] >>= 4,op+=4) { const unsigned m = bb[0]&0xf; _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); } - for(op=out+64; b1; b1 >>= 4,op+=4) { const unsigned m = b1&0xf; + for(op=out+64; bb[1]; bb[1] >>= 4,op+=4) { const unsigned m = bb[1]&0xf; _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); } + #else + for(i = 0; i < P4DN; i++) { // Loop unrolling + for(op = out; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf; + _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); + } out+=64; + } 
+ #endif #else unsigned k = 0; - while(b0) { unsigned x = ctzll(b0); out[x] += ex[k++]<i&1)?(p4d->xmap+2):p4d->in+ PAD8(n*xb); +unsigned char *TEMPLATE2(p4dfdecx, USIZE)(unsigned char *__restrict in, int n, unsigned start, uint_t *__restrict out) { + unsigned b,i; + struct p4d p4d; + p4dini(&p4d, &in, n, &b); + + if(unlikely(p4d.i & 1)) { + for(i = 0; i < n; i++) + out[i] = TEMPLATE2(vp4dget, USIZE)(p4d, in, b, i)+start+i+1; + return p4d.ex + PAD8((p4d.cum[P4DN-1] + popcnt64(p4d.xmap[P4DN-1]))*p4d.xb); + } else { + for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, b, i*b)+start+i+1; + return p4d.ex; + } } - #endif + +unsigned char *TEMPLATE2(p4df0decx, USIZE)(unsigned char *__restrict in, int n, unsigned start, uint_t *__restrict out) { + unsigned b,i; + struct p4d p4d; + p4dini(&p4d, &in, n, &b); + + if(unlikely(p4d.i & 1)) { + for(i = 0; i < n; i++) + out[i] = TEMPLATE2(vp4dget, USIZE)(p4d, in, b, i)+start; + return p4d.ex + PAD8((p4d.cum[P4DN-1] + popcnt64(p4d.xmap[P4DN-1]))*p4d.xb); + } else { + for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, b, i*b)+start; + return p4d.ex; + } +} + + diff --git a/vsimple.c b/vsimple.c index f8bff77..d0709dc 100644 --- a/vsimple.c +++ b/vsimple.c @@ -16,7 +16,7 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo @@ -25,18 +25,21 @@ **/ #include "vsimple.h" - + #define USE_RLE // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 #define SV_LIM unsigned char s_lim[] = { 0, 28, 28, 28, 28, 36, 36, 36, 36, 36, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 0 }; -#define SV_ITM unsigned s_itm[] = { -1, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 } +#define SV_ITM unsigned s_itm[] = { 0, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 } static SV_ITM; static SV_LIM; #include #define USIZE 32 #include "vsimple_.h" +#undef USIZE #define USIZE 16 #include "vsimple_.h" +#undef USIZE + diff --git a/vsimple.h b/vsimple.h index b1684f4..4eeb26f 100644 --- a/vsimple.h +++ b/vsimple.h @@ -16,27 +16,33 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo - vsimple.h - "Integer Compression" variable simple + vsimple.h - "Integer Compression" variable simple "SimpleV" + this belongs to the integer compression known as "simple family", like simple-9,simple-16 + or simple-8b. SimpleV is compressing integers in groups into variable word size 32, 40 and 64 bits + RLE (run length encoding) + SimpleV is faster than simple-16 and compress better than simple-16 or simple-8b. 
**/ #ifdef __cplusplus extern "C" { #endif -unsigned char *vsenc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out); -unsigned char *vsdec32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); +// compress array with n unsigned (32 bits in[n]) values to the buffer out. Return value = end of compressed buffer out +unsigned char *vsenc32(unsigned *__restrict in, int n, unsigned char *__restrict out); -unsigned char *vsenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out); -unsigned char *vsdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out); +// decompress buffer into an array of n unsigned values. Return value = end of decompressed buffer in +unsigned char *vsdec32(unsigned char *__restrict in, int n, unsigned *__restrict out); + +// like vsenc32 but for 16 bits values +unsigned char *vsenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out); + +// like vsdec32 but for 16 bits values +unsigned char *vsdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out); #ifdef __cplusplus } #endif - - - diff --git a/vsimple_.h b/vsimple_.h index 59f1dbe..891efda 100644 --- a/vsimple_.h +++ b/vsimple_.h @@ -16,29 +16,38 @@ with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - email : powturbo@gmail.com + - email : powturbo [AT] gmail.com - github : https://github.com/powturbo - homepage : https://sites.google.com/site/powturbo/ - twitter : https://twitter.com/powturbo vsimple_.h - "Integer Compression" variable simple **/ - +#include +#include +#include "conf.h" #include "vint.h" #define uint_t TEMPLATE3(uint, USIZE, _t) -unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ op) { - unsigned xm,m,r; +unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict in, int n, unsigned char *__restrict op) { + unsigned xm,m,r,x; uint_t *e = in+n,*ip; for(ip = in; ip < e; ) { #ifdef USE_RLE - if(ip < e-4 && *ip == *(ip+1)) { uint_t *q = ip+1; while(q < e-1 && *(q+1) == *ip) q++; r = q - ip; - if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) { m = (*ip)?33:0; goto a; } + if(ip+4 < e && *ip == *(ip+1)) { + uint_t *q = ip+1; + while(q+1 < e && *(q+1) == *ip) q++; + r = q - ip; + if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) { + m = (*ip)?33:0; + goto a; + } } else #endif - r = 0; unsigned x = m = bsr32(*ip); - while((r+1)*(xm = x > m?x:m) <= s_lim[xm]) { m = xm; x = TEMPLATE2(bsr, USIZE)(*(ip+(++r))); } - if(/*xm != 32 &&*/ m) while(r < s_itm[m]) m++; + r = 0; + for(m = x = TEMPLATE2(bsr, USIZE)(*ip);(r+1)*(xm = x > m?x:m) <= s_lim[xm] && ip+r>4)&0xf; ip++; + unsigned r = (w>>4)&0xf; ip++; if(unlikely(r == 0xf)) { if(n <= 0x100) r = (w>>8)&0xff, ip++; @@ -247,7 +258,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui } break; case 1: OP( 0) = (w >> 4) & 1; - OP( 1) = (w >> 5) & 1; + OP( 1) = (w >> 5) & 1; OP( 2) = (w >> 6) & 1; OP( 3) = (w >> 7) & 1; OP( 4) = (w >> 8) & 1; @@ -273,7 +284,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP(24) = (w >> 28) & 1; OP(25) = (w >> 29) & 1; OP(26) = (w >> 30) & 1; - OP(27) = (w >> 31) & 1; OPI( 28); ip+=4; + OP(27) = (w >> 31) & 1; OPI( 28); ip+=4; break; case 2: OP( 0) = (w >> 
4) & 3; @@ -289,7 +300,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP(10) = (w >> 24) & 3; OP(11) = (w >> 26) & 3; OP(12) = (w >> 28) & 3; - OP(13) = (w >> 30) & 3; OPI( 14); ip+=4; + OP(13) = (w >> 30) & 3; OPI( 14); ip+=4; break; case 3: OP( 0) = (w >> 4) & 7; @@ -300,7 +311,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP( 5) = (w >> 19) & 7; OP( 6) = (w >> 22) & 7; OP( 7) = (w >> 25) & 7; - OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4; + OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4; break; case 4: OP( 0) = (w >> 4) & 0xf; @@ -326,7 +337,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP(2) = (w >> 16) & 0x3f; OP(3) = (w >> 22) & 0x3f; OP(4) = (w >> 28) & 0x3f; - OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5; + OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5; break; case 7: @@ -353,7 +364,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP(0) = (w >> 4) & 0x1ff; OP(1) = (w >> 13) & 0x1ff; OP(2) = (w >> 22) & 0x1ff; - OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5; + OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5; break; case 10: @@ -362,32 +373,32 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui OP(2) = (w >> 24) & 0x3ff; OP(3) = (w >> 34) & 0x3ff; OP(4) = (w >> 44) & 0x3ff; - OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8; + OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8; break; case 12: - OP(0) = (w >> 4) & 0xfff; - OP(1) = (w >> 16) & 0xfff; - OP(2) = (w >> 28) & 0xfff; - OP(3) = (w >> 40) & 0xfff; - OP(4) = (w >> 52) & 0xfff; OPI( 5); ip+=8; + OP(0) = (w >> 4) & 0xfffu; + OP(1) = (w >> 16) & 0xfffu; + OP(2) = (w >> 28) & 0xfffu; + OP(3) = (w >> 40) & 0xfffu; + OP(4) = (w >> 52) & 0xfffu; OPI( 5); ip+=8; break; case 15: - OP(0) = (w >> 4) & 0x7fff; - OP(1) = (w >> 19) & 0x7fff; - OP(2) = (w >> 34) & 0x7fff; - OP(3) = (w >> 49) & 0x7fff; OPI( 4); ip+=8; + OP(0) = (w >> 4) & 0x7fffu; + OP(1) = (w >> 19) & 0x7fffu; + 
OP(2) = (w >> 34) & 0x7fffu; + OP(3) = (w >> 49) & 0x7fffu; OPI( 4); ip+=8; break; case 11: - OP(0) = (w >> 4) & 0xfffff; // 20 - OP(1) = (w >> 24) & 0xfffff; - OP(2) = (w >> 44) & 0xfffff; OPI( 3); ip+=8; + OP(0) = (w >> 4) & 0xfffffu; // 20 + OP(1) = (w >> 24) & 0xfffffu; + OP(2) = (w >> 44) & 0xfffffu; OPI( 3); ip+=8; break; case 13: - OP(0) = (w >> 4) & ((1<<30)-1); - OP(1) = (w >> 34) & ((1<<30)-1); OPI( 2); ip+=8; - break; + OP(0) = (w >> 4) & 0x3fffffffu; + OP(1) = (w >> 34) & 0x3fffffffu; OPI( 2); ip+=8; + break; case 14: - OP(0) = (w >> 4) & ((1ull<<32)-1); OPI( 1); ip+=5; + OP(0) = (w >> 4) & 0xffffffffu; OPI( 1); ip+=5; break; } }