diff --git a/README.md b/README.md
index 5fbe05e..8192776 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,30 @@
TurboPFor: Fastest Integer Compression [![Build Status](https://travis-ci.org/powturbo/TurboPFor.svg)](https://travis-ci.org/powturbo/TurboPFor)
======================================
-- 100% C, without inline assembly
+- 100% C/C++, without inline assembly
- Fastest **"Variable Byte"** implementation
- Novel **"Variable Simple"** faster than simple16 and more compact than simple64
-- Scalar **"Binary Packing"** with bulk decoding as fast as SIMD FastPFor in realistic (No "pure cache") scenarios
-- Binary Packing with **Direct/Random Access** without decompressing entire blocks
-- Access any single binary packed entry with **zero decompression**
+- Scalar **"Bit Packing"** with bulk decoding as fast as SIMD FastPFor in realistic and practical (No "pure cache") scenarios
+- Bit Packing with **Direct/Random Access** without decompressing entire blocks
+- Access any single bit packed entry with **zero decompression**
+- Reduced **Cache Pollution**
-- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding
+- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding.
+ Outstanding compression ratio
- Several times faster than other libraries
-- Usage as easy as memcpy
-- Instant access to compressed *frequency* and *position* data in inverted index with zero decoding
-
+- Usage in C/C++ as easy as memcpy
+- Most functions optimized for speed and others for high compression ratio
+- **New:** Includes more functions
+
+- Instant access to compressed *frequency* and *position* data in inverted index with zero decompression
+- **New:** Inverted Index Demo + Benchmarks: Intersection of lists of sorted integers.
+- more than **1000 queries per second** on gov2 (25 million documents) on a **SINGLE** core.
+- Decompress only the minimum necessary blocks.
+
# Benchmark:
i7-2600K at 3.4GHz, gcc 4.9, Ubuntu 14.10.
- Single thread
@@ -47,23 +55,63 @@ coming soon!
## Compile:
make
-## Usage
+## Benchmark
###### Synthetic data:
- 1. test all functions
+ 1. test all functions
./icbench -a1.0 -m0 -x8 -n100000000
- zipfian distribution alpha = 1.0 (Ex. -a1.0=uniform -a1.5=skewed distribution)
- number of integers = 100000000
- integer range from 0 to 255 (integer size = 0 to 8 bits)
- 2. individual function test (ex. copy TurboPack TurboPack Direct access)
- ./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopack,da -n100000000
+ 2. individual function test (ex. copy TurboPack TurboPack Direct access)
+ ./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopackda -n100000000
###### Data files:
- - Data file Benchmark (file format as in FastPFOR)
- ./icbench -n10000000000 clueweb09.sorted
+ - Data file Benchmark (file format as in FastPFOR)
+ ./icbench gov2.sorted
+
+###### Benchmarking intersections
+ - Download "gov2.sorted" (or clueweb09) + query file "aol.txt"
+ from "http://lemire.me/data/integercompression2014.html"
+
+ - Create index file gov2.sorted.i
+ ./idxcr gov2.sorted .
+ create inverted index file "gov2.sorted.i" in the current directory
+
+ - Benchmarking intersections
+ ./idxqry gov2.sorted.i aol.txt
+ run queries in file "aol.txt" over the index of gov2 file
+
+ A minimum of 8GB RAM is required (16GB recommended for benchmarking the "clueweb09" files).
+
+
+## Function usage:
+In general, compression/decompression functions are of the form:
+
+ char *endptr = compress( unsigned *in, int n, char *out)
+ endptr : set by compress to the next character in "out" after the compressed buffer
+ in : input integer array
+ n : number of elements
+ out : pointer to output buffer
+
+ char *endptr = decompress( char *in, int n, unsigned *out)
+ endptr : set by decompress to the next character in "in" after the decompressed buffer
+ in : pointer to input buffer
+ n : number of elements
+ out : output integer array
+
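+Sketch of a round trip using the generic names above (substitute any concrete
+pair declared in the headers below, e.g. the variable byte coder from vint.h):
+
+    unsigned in[1024], out[1024];            // in[] filled by the application
+    char buf[1024*5];                        // 5 bytes/value: 32-bit variable byte worst case
+    char *ep = compress(in, 1024, buf);      // ep - buf = compressed size in bytes
+    decompress(buf, 1024, out);              // out now holds the original 1024 values
+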
+Header files with documentation:
+ vint.h - variable byte
+ vsimple.h - variable simple
+ vp4dc.h,vp4dd.h - TurboPFor
+ bitpack.h,bitunpack.h - Bit Packing
+
## Reference:
- - "SIMD-BitPack FPF" from FastPFor https://github.com/lemire/simdcomp
+ - "SIMD-BitPack FPF" from FastPFor https://github.com/lemire/simdcomp
+ - Sorted integer datasets from http://lemire.me/data/integercompression2014.html
- OptP4 and Simple-16 from http://jinruhe.com/
+#----------------------------------------------------------------------------------
+
diff --git a/bitpack.c b/bitpack.c
index e364984..33a8dfb 100644
--- a/bitpack.c
+++ b/bitpack.c
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -29,6 +29,6 @@
#define PAD8(__x) ( (((__x)+8-1)/8) )
-unsigned char *bitpack32(unsigned *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
-unsigned char *bitpack16(unsigned short *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
+unsigned char *bitpack32(unsigned *__restrict in, int n, int nb, unsigned char *__restrict out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
+unsigned char *bitpack16(unsigned short *__restrict in, int n, int nb, unsigned char *__restrict out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
diff --git a/bitpack.h b/bitpack.h
index 77dee67..86a1431 100644
--- a/bitpack.h
+++ b/bitpack.h
@@ -16,15 +16,17 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitpack.c - "Integer Compression" binary packing
+ bitpack.c - "Integer Compression" Binary Packing
**/
-unsigned char *bitpack16( unsigned short *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out);
-unsigned char *bitpack32( unsigned *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out);
+// Pack array with n unsigned (32 bits in[n]) values to the buffer out using nbits per value. Return value = end of compressed buffer out
+unsigned char *bitpack32( unsigned *__restrict in, int n, int nbits, unsigned char *__restrict out);
+// like bitpack32 but for 16 bits arrays
+unsigned char *bitpack16( unsigned short *__restrict in, int n, int nbits, unsigned char *__restrict out);
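+//
+// Usage sketch (assuming unsigned in[128] with all values < (1<<9)):
+//   unsigned char buf[(128*9+7)/8], *ep = bitpack32(in, 128, 9, buf);
+//   // ep - buf == (128*9+7)/8 bytes written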
diff --git a/bitpack64_.h b/bitpack64_.h
index d74b27c..4ce1e8d 100644
--- a/bitpack64_.h
+++ b/bitpack64_.h
@@ -16,12 +16,12 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitpack64_.h - "Integer Compression" binary packing
+ bitpack64_.h - "Integer Compression" bit packing include file
**/
#define BITBLK32_1(ip, i, op, parm) { ; register uint32_t w;;\
diff --git a/bitpack_.h b/bitpack_.h
index 3fcabd2..2ca534e 100644
--- a/bitpack_.h
+++ b/bitpack_.h
@@ -16,14 +16,13 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitpack_.h - "Integer Compression" binary packing
+ bitpack_.h - "Integer Compression" bit packing
**/
-
#include <stdint.h>
#define USE_BITPACK 64
#if 0
@@ -77,8 +76,8 @@
case 32:do BITPACK64_32(__ip, __op, __parm) while(__ip < __ipe);\
}\
} while(0)
- #elif USE_BITPACK == 32
-#include "bitpack32_.h"
+ #else
+#include "bitpack32_.h" // Not included in the github package
#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\
switch(__nbits) {\
case 0:__ip = __ipe; break;\
@@ -114,87 +113,6 @@
case 30:do BITPACK32_30(__ip, __op, __parm) while(__ip < __ipe); break;\
case 31:do BITPACK32_31(__ip, __op, __parm) while(__ip < __ipe); break;\
case 32:do BITPACK32_32(__ip, __op, __parm) while(__ip < __ipe);\
- } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\
-} while(0)
- #else
- #if 1
-#define SRCI(__ip) __ip+=32
-#define SRC(__ip,__x) __ip[__x]
-#define SRCP( __ip)
- #else
-#define SRCI(__ip)
-#define SRC( __ip,__x) (*__ip++)
-//#define SRCP( __ip) (__ip++)
- #endif
-#include "pack/bitpack32_1.h"
-#include "pack/bitpack32_2.h"
-#include "pack/bitpack32_3.h"
-#include "pack/bitpack32_4.h"
-#include "pack/bitpack32_5.h"
-#include "pack/bitpack32_6.h"
-#include "pack/bitpack32_7.h"
-#include "pack/bitpack32_8.h"
-#include "pack/bitpack32_9.h"
-#include "pack/bitpack32_10.h"
-#include "pack/bitpack32_11.h"
-#include "pack/bitpack32_12.h"
-#include "pack/bitpack32_13.h"
-#include "pack/bitpack32_14.h"
-#include "pack/bitpack32_15.h"
-#include "pack/bitpack32_16.h"
-#include "pack/bitpack32_17.h"
-#include "pack/bitpack32_18.h"
-#include "pack/bitpack32_19.h"
-#include "pack/bitpack32_20.h"
-#include "pack/bitpack32_21.h"
-#include "pack/bitpack32_22.h"
-#include "pack/bitpack32_23.h"
-#include "pack/bitpack32_24.h"
-#include "pack/bitpack32_25.h"
-#include "pack/bitpack32_26.h"
-#include "pack/bitpack32_27.h"
-#include "pack/bitpack32_28.h"
-#include "pack/bitpack32_29.h"
-#include "pack/bitpack32_30.h"
-#include "pack/bitpack32_31.h"
-#include "pack/bitpack32_32.h"
-#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\
- switch(__nbits) {\
- case 0:__ip = __ipe; break;\
- case 1:do BITPACK_1( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 2:do BITPACK_2( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 3:do BITPACK_3( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 4:do BITPACK_4( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 5:do BITPACK_5( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 6:do BITPACK_6( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 7:do BITPACK_7( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 8:do BITPACK_8( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 9:do BITPACK_9( __ip, __op, __parm) while(__ip < __ipe); break;\
- case 10:do BITPACK_10(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 11:do BITPACK_11(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 12:do BITPACK_12(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 13:do BITPACK_13(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 14:do BITPACK_14(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 15:do BITPACK_15(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 16:do BITPACK_16(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 17:do BITPACK_17(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 18:do BITPACK_18(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 19:do BITPACK_19(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 20:do BITPACK_20(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 21:do BITPACK_21(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 22:do BITPACK_22(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 23:do BITPACK_23(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 24:do BITPACK_24(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 25:do BITPACK_25(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 26:do BITPACK_26(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 27:do BITPACK_27(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 28:do BITPACK_28(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 29:do BITPACK_29(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 30:do BITPACK_30(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 31:do BITPACK_31(__ip, __op, __parm) while(__ip < __ipe); break;\
- case 32:do BITPACK_32(__ip, __op, __parm) while(__ip < __ipe);\
- } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\
+ }\
} while(0)
#endif
-//
-
diff --git a/bitunpack.c b/bitunpack.c
index 830ad4b..14550ad 100644
--- a/bitunpack.c
+++ b/bitunpack.c
@@ -16,41 +16,50 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitunpack_.h - "Integer Compression" binary packing
+ bitunpack_.h - "Integer Compression" Binary Packing
**/
-
+#include "conf.h"
#include "bitunpack.h"
-
#define PAD8(__x) (((__x)+7)/8)
-unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i; for(i=0; i < n; i++ ) out[i] = bitgetx32(in, b, i); return in + PAD8(n*b); }
-unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
-#define BPI(__w,__parm) __w
+//-----------------------------------------------------------------------------------------------------------------
+#define BPI(__w, __op, __parm) __w
#include "bitunpack_.h"
-unsigned char *bitunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
-unsigned char *bitunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned short *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
+unsigned char *bitunpack32( unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; }
+unsigned char *bitunpack16( unsigned char *__restrict in, unsigned n, unsigned b, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return ip; }
+#undef BPI
+
+//-----------------------------------------------------------------------------------------------------------------
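+// BPI defines how each unpacked value __w is post-processed: here a running prefix sum + 1 (delta decoding of strictly increasing sequences)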
+#define BPI(__w, __op, __parm) (__parm += (__w) + 1)
+#include "bitunpack_.h"
+unsigned char *bitdunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitdunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
#undef BPI
//------------------------------------------------------------------------------------------
-#define BPI(__w,__parm) (__parm += (__w) + 1)
+#define BPI(__w, __op, __parm) (__parm += (__w))
#include "bitunpack_.h"
-
-unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
-unsigned char *bitdunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
-unsigned char *bitdunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitd0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitd0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
#undef BPI
//------------------------------------------------------------------------------------------
-#define BPI(__w,__parm) (__parm + (__w) + 1)
+#define BPI(__w, __op, __parm) (__parm + (__op+1-_op))//#define BPI(__w, __op, __parm) (__parm + (__w) + 1)
#include "bitunpack_.h"
-
-unsigned char *bitfunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
-unsigned char *bitfunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+1; return in + PAD8(n*b); }
-unsigned char *bitfunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitfunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitfunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+#undef BPI
+
+//------------------------------------------------------------------------------------------
+#define BPI(__w, __op, __parm) (__parm + (__op-_op))
+#include "bitunpack_.h"
+
+unsigned char *bitf0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
+unsigned char *bitf0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out) { unsigned char *ip = in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return ip; }
#undef BPI
diff --git a/bitunpack.h b/bitunpack.h
index ff1054c..bc572ff 100644
--- a/bitunpack.h
+++ b/bitunpack.h
@@ -16,36 +16,63 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitunpack.h - "Integer Compression" binary packing
+ bitunpack.h - "Integer Compression" Binary Packing
**/
+#ifdef __cplusplus
+extern "C" {
+#endif
-// BP
-static inline unsigned bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
-static inline unsigned _bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
-static inline unsigned bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
-static inline unsigned _bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
+ #ifdef __BMI2__
+#include <x86intrin.h>
+ #else
+#define _bzhi_u64(__u, __b) ((__u) & ((1ull<<__b)-1))
+#define _bzhi_u32(__u, __b) ((__u) & ((1u <<__b)-1))
+ #endif
-unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out);
-unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out);
+// Get a single 32 bits value with index "idx" (or bit index b*idx) from packed integer array
+static ALWAYS_INLINE unsigned bitgetx32(unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); }
+static ALWAYS_INLINE unsigned _bitgetx32(unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u64( (*(unsigned long long *)((unsigned *)in+(bidx>>5))) >> (bidx&0x1f), b ); }
+
+// like bitgetx32 but for 16 bits integer array
+static ALWAYS_INLINE unsigned bitgetx16(unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); }
+static ALWAYS_INLINE unsigned _bitgetx16(unsigned char *__restrict in, unsigned b, unsigned bidx) { return _bzhi_u32( (*(unsigned *)((unsigned *)in+(bidx>>4))) >> (bidx& 0xf), b ); }
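+//
+// Usage sketch: v = bitgetx32(in, 9, 1000) reads the 1000'th 9-bit value directly, without unpacking any block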
-// DFOR
-unsigned char *bitdunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out);
-unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out);
-unsigned char *bitdunpackb32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out);
+// Set a single value with index "idx"
+static ALWAYS_INLINE void bitsetx32(unsigned char *__restrict in, unsigned b, unsigned idx, unsigned v) { unsigned bidx = b*idx; unsigned long long *p = (unsigned long long *)((unsigned *)in+(bidx>>5)); *p = ( *p & ~(((1ull<<b)-1)<<(bidx&0x1f)) ) | ((unsigned long long)v<<(bidx&0x1f)); }
+// like bitsetx32 but for 16 bits arrays
+static ALWAYS_INLINE void bitsetx16(unsigned char *__restrict in, unsigned b, unsigned idx, unsigned v) { unsigned bidx = b*idx; unsigned *p = (unsigned *)((unsigned short *)in+(bidx>>4)); *p = ( *p & ~(((1u <<b)-1)<<(bidx&0xf)) ) | (v<<(bidx&0xf)); }
+
+// Linear search in a packed sorted array: return the first value >= val and store its index in *oidx (name reconstructed)
+static ALWAYS_INLINE unsigned bitgetx32geq(unsigned char *__restrict in, unsigned n, unsigned b, unsigned val, unsigned *oidx) { unsigned idx; for(idx = 0; idx < n; idx++) { unsigned oval = bitgetx32(in, b, idx); if(oval >= val) { *oidx = idx; return oval; } } return INT_MAX; }
+
+// out[0] = start + in[0] + 1; out[1] = out[0] + in[1] + 1; ... ; out[i] = out[i-1] + in[i] + 1
+unsigned char *bitdunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out);
+unsigned char *bitdunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out);
+
+// out[0] = start + in[0]; out[1] = out[0] + in[1]; ... ; out[i] = out[i-1] + in[i]
+unsigned char *bitd0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out);
+unsigned char *bitd0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out);
+
+// ---------------- DaFor : Direct Access for packed SORTED array (Ex. DocId in inverted index) --------------------------------------------
+// out[i] = start + in[i] + i + 1
+unsigned char *bitfunpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out);
+unsigned char *bitfunpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out);
+
+// out[i] = start + in[i] + i
+unsigned char *bitf0unpack32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out);
+unsigned char *bitf0unpack16( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned short *__restrict out);
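+//
+// Sketch: strictly increasing values d[i] stored as b-bit residues r[i] = d[i] - start - i - 1
+// are restored in one call: bitfunpack32(in, n, b, start, d)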
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/bitunpack64_.h b/bitunpack64_.h
index 88ac332..8ad57b9 100644
--- a/bitunpack64_.h
+++ b/bitunpack64_.h
@@ -16,12 +16,12 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- bitunpack64_.c - "Integer Compression" binary packing
+ bitunpack64_.h - "Integer Compression" scalar bit unpacking
**/
#define BITUNBLK32_0(ip, i, op, parm) { \
@@ -141,7 +141,7 @@
BITUNBLK64_2(ip, 0, op, parm); DSTI(op); ip += 2*4/sizeof(ip[0]);\
}
-#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\
+#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\
DST(op,i*64+ 0, (w0 ) & 0x7, parm);\
DST(op,i*64+ 1, (w0 >> 3) & 0x7, parm);\
DST(op,i*64+ 2, (w0 >> 6) & 0x7, parm);\
@@ -162,7 +162,7 @@
DST(op,i*64+17, (w0 >> 51) & 0x7, parm);\
DST(op,i*64+18, (w0 >> 54) & 0x7, parm);\
DST(op,i*64+19, (w0 >> 57) & 0x7, parm);\
- DST(op,i*64+20, (w0 >> 60) & 0x7, parm); \
+ DST(op,i*64+20, (w0 >> 60) & 0x7, parm); register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\
\
DST(op,i*64+21, (w0 >> 63) | (w1 << 1) & 0x7, parm);\
DST(op,i*64+22, (w1 >> 2) & 0x7, parm);\
@@ -181,28 +181,28 @@
BITUNBLK64_3(ip, 0, op, parm); DSTI(op); ip += 3*4/sizeof(ip[0]);\
}
-#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip/*+(i*1+0)*8/sizeof(ip[0])*/);ip += 8/sizeof(ip[0]);\
- DST(op,i*16+ 0, (w0 ) & 0xf, parm);\
- DST(op,i*16+ 1, (w0 >> 4) & 0xf, parm);\
- DST(op,i*16+ 2, (w0 >> 8) & 0xf, parm);\
- DST(op,i*16+ 3, (w0 >> 12) & 0xf, parm);\
- DST(op,i*16+ 4, (w0 >> 16) & 0xf, parm);\
- DST(op,i*16+ 5, (w0 >> 20) & 0xf, parm);\
- DST(op,i*16+ 6, (w0 >> 24) & 0xf, parm);\
- DST(op,i*16+ 7, (w0 >> 28) & 0xf, parm);\
- DST(op,i*16+ 8, (w0 >> 32) & 0xf, parm);\
- DST(op,i*16+ 9, (w0 >> 36) & 0xf, parm);\
- DST(op,i*16+10, (w0 >> 40) & 0xf, parm);\
- DST(op,i*16+11, (w0 >> 44) & 0xf, parm);\
- DST(op,i*16+12, (w0 >> 48) & 0xf, parm);\
- DST(op,i*16+13, (w0 >> 52) & 0xf, parm);\
- DST(op,i*16+14, (w0 >> 56) & 0xf, parm);\
- DST(op,i*16+15, (w0 >> 60), parm);;\
+#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\
+ DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 0, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 1, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
+ DST(op,i*16+ 2, (unsigned char)w0 & 0xf, parm);\
+ DST(op,i*16+ 3, (unsigned char)w0 >> 4, parm); w0 >>= 8;\
}
#define BITUNPACK64_4(ip, op, parm) { \
BITUNBLK64_4(ip, 0, op, parm);\
- BITUNBLK64_4(ip, 1, op, parm); DSTI(op); /*ip += 4*4/sizeof(ip[0]);*/\
+ BITUNBLK64_4(ip, 1, op, parm); DSTI(op); ip += 4*4/sizeof(ip[0]);\
}
#define BITUNBLK64_5(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\
@@ -330,14 +330,14 @@
}
#define BITUNBLK64_8(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\
- DST(op,i*8+ 0, (w0 ) & 0xff, parm);\
- DST(op,i*8+ 1, (w0 >> 8) & 0xff, parm);\
- DST(op,i*8+ 2, (w0 >> 16) & 0xff, parm);\
- DST(op,i*8+ 3, (w0 >> 24) & 0xff, parm);\
- DST(op,i*8+ 4, (w0 >> 32) & 0xff, parm);\
- DST(op,i*8+ 5, (w0 >> 40) & 0xff, parm);\
- DST(op,i*8+ 6, (w0 >> 48) & 0xff, parm);\
- DST(op,i*8+ 7, (w0 >> 56) , parm);;\
+ DST(op,i*8+ 0, (unsigned char)(w0 ), parm);\
+ DST(op,i*8+ 1, (unsigned char)(w0 >> 8), parm);\
+ DST(op,i*8+ 2, (unsigned char)(w0 >> 16), parm);\
+ DST(op,i*8+ 3, (unsigned char)(w0 >> 24), parm);\
+ DST(op,i*8+ 4, (unsigned char)(w0 >> 32), parm);\
+ DST(op,i*8+ 5, (unsigned char)(w0 >> 40), parm);\
+ DST(op,i*8+ 6, (unsigned char)(w0 >> 48), parm);\
+ DST(op,i*8+ 7, (unsigned char)(w0 >> 56), parm);;\
}
#define BITUNPACK64_8(ip, op, parm) { \
diff --git a/bitunpack_.h b/bitunpack_.h
index 172e3d4..5c22dcb 100644
--- a/bitunpack_.h
+++ b/bitunpack_.h
@@ -16,23 +16,22 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
bitunpack_.h - "Integer Compression" binary packing
**/
-
#include <stdint.h>
-#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w,__parm) //__op[__x] = BPI(__w,__parm) //
+#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w, __op, __parm) //__op[__x] = BPI(__w,__parm) //
#define DSTI(__op) //__op += 32 //
#define USE_BITUNPACK 64
#if USE_BITUNPACK == 64
#include "bitunpack64_.h"
-#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\
+#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n,*_op=__op;\
switch(__nbits) {\
case 0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\
case 1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\
@@ -70,8 +69,8 @@
}\
}
#elif USE_BITUNPACK == 32
-#include "bitunpack32_.h"
-#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\
+#include "bitunpack32_.h" // Not included in the github package
+#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;\
switch(__nbits) {\
case 0: do BITUNPACK32_0( __ip, __op, __parm) while(__op<__ope); break;\
case 1: do BITUNPACK32_1( __ip, __op, __parm) while(__op<__ope); break;\
@@ -106,7 +105,7 @@
case 30: do BITUNPACK32_30(__ip, __op, __parm) while(__op<__ope); break;\
case 31: do BITUNPACK32_31(__ip, __op, __parm) while(__op<__ope); break;\
case 32: do BITUNPACK32_32(__ip, __op, __parm) while(__op<__ope); break;\
- } /*printf("n=%d,%d,%d ", __n, __op, __parm - sd, __op, __parme - __op);*/\
+ }\
}
#endif
diff --git a/conf.h b/conf.h
index 2383ad1..185fbdf 100644
--- a/conf.h
+++ b/conf.h
@@ -16,19 +16,20 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
conf.h - "Integer Compression" config & common
**/
-
+#ifndef CONF_H
+#define CONF_H
#if defined(__GNUC__)
#define ALIGNED(t,v,n) __attribute__ ((aligned (n))) t v
-#define ALWAYS_INLINE __attribute__((always_inline))
-#define _PACKED __attribute__ ((packed))
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define _PACKED __attribute__ ((packed))
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
@@ -48,6 +49,11 @@ static inline int bsr32(int x) {
return b + 1;
}
+static inline int __bsr32(int x) {
+ asm("bsr %1,%0" : "=r" (x) : "rm" (x) );
+ return x;
+}
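+// e.g. bsr32(7) == 3 (number of bits needed to represent 7), __bsr32(7) == 2 (index of the highest set bit)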
+
static inline int bsr64(unsigned long long x) {
return x?64 - __builtin_clzll(x):0;
}
@@ -66,5 +72,25 @@ static inline int bsr64(unsigned long long x) {
#else
#error "only gcc support in this version"
#endif
+//---------------------------------------------------------------------------------------------------
+#define ctou8(__cp) (*(unsigned char *)(__cp))
+#define ctou16(__cp) (*(unsigned short *)(__cp))
+#define ctou24(__cp) ((*(unsigned *)(__cp)) & 0xffffff)
+#define ctou32(__cp) (*(unsigned *)(__cp))
+#define ctou64(__cp) (*(unsigned long long *)(__cp))
+#define ctou48(__cp) ((*(unsigned long long *)(__cp)) & 0xffffffffffff)
+#define ctou(__cp_t, __cp) (*(__cp_t *)(__cp))
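+// e.g. unsigned v = ctou32(p+3); // unaligned 32-bit load from any byte address (assumes the target allows unaligned access)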
+ #ifndef min
+#define min(x,y) (((x)<(y)) ? (x) : (y))
+#define max(x,y) (((x)>(y)) ? (x) : (y))
+ #endif
+ #ifdef NDEBUG
+#define AS(expr, fmt,args...)
+ #else
+#include <stdio.h>
+#define AS(expr, fmt,args...) if(!(expr)) { fflush(stdout);fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); abort(); }
+ #endif
+#define die(fmt,args...) do { fprintf(stderr, "%s:%s:%d:", __FILE__, __FUNCTION__, __LINE__); fprintf(stderr, fmt, ## args ); fflush(stderr); exit(-1); } while(0)
+#endif
diff --git a/ext/OPT_PFD/main.cpp b/ext/OPT_PFD/main.cpp
new file mode 100644
index 0000000..2c0ec06
--- /dev/null
+++ b/ext/OPT_PFD/main.cpp
@@ -0,0 +1,101 @@
+/*
+ * test for OPT-pfd
+ *
+ * Author: sding
+ *
+ *
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "opt_p4.h"
+
+using namespace std;
+
+char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite"; // for reading list
+
+int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc)
+{
+ char fpath[128];
+ sprintf(fpath,"%s/%s",PATH,term);
+ FILE *fdd = fopen(fpath,"r");
+ if(fdd==NULL) return 0;
+
+ int nread, npos;
+
+ nread = fread(&npos, sizeof(unsigned), 1, fdd);
+ npos = 0;
+
+ while (nread > 0)
+ {
+ nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd);
+ if (nread <= 0) break;
+ fread(&freq[npos], sizeof(unsigned), 1, fdd);
+ npos++;
+ }
+ fclose(fdd);
+
+ int i;
+
+ /* fill out the max values */
+ for (i = 0; i < npos; i += BS)
+ maxc[(i/BS)] = doc_id[i+BS-1];
+
+ /* take the gap for doc_id */
+ for (i = npos-1; i > 0; i--)
+ {
+ doc_id[i] -= doc_id[i-1];
+ doc_id[i] --;
+ }
+
+ for (i = 0; i < npos; i++)
+ freq[i]--;
+ return npos;
+}
+
+int main() // just for testing
+{
+ int MAX_NDOC = 25205179;
+ unsigned int *docid = new unsigned int[MAX_NDOC];
+ unsigned int *docid_check = new unsigned int[MAX_NDOC ];
+
+ unsigned int *fre = new unsigned int[MAX_NDOC];
+ unsigned int *maxc = new unsigned int[MAX_NDOC/BS];
+ unsigned int *aux = new unsigned int[MAX_NDOC];
+ unsigned int * all_array = new unsigned int[2048]; // extra array for coding
+
+
+ int listSize = get_list("information", docid, fre, maxc);
+ cout<<"list size is "< size * 4) // int bytes
+ {
+ chunk_size = size *4;
+ b = l;
+ temp_en = ex_n;
+ }
+ }
+
+ csize += chunk_size;
+ //printf("encode:%u\n", b);
+ p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n);
+ offset += size;
+ }
+
+ return csize;
+}
diff --git a/ext/OPT_PFD/pf.h b/ext/OPT_PFD/pf.h
new file mode 100644
index 0000000..788f8cc
--- /dev/null
+++ b/ext/OPT_PFD/pf.h
@@ -0,0 +1,158 @@
+#include "s16head.h"
+#include "unpack.h"
+
+
+#define BS 128
+#define FRAC 0.10
+#define S 16
+#define PCHUNK 128
+
+void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
+
+
+int detailed_p4_encode(unsigned int **w, unsigned int* p, int num , int *chunk_size, int * exception_n)
+{
+ int i, j, t, s;
+
+ unsigned int b = cnum[num];
+ int bb_e;
+ int bb_p;
+ int p_low;
+ unsigned int e_n = 0;
+ int max_p = 0;
+ int max_e = 0;
+
+ unsigned int* out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+ unsigned int* ex = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+ unsigned int* po = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+
+ unsigned int* tp = NULL;
+ unsigned int *_pp, *_ww;
+
+ if (b == 32)
+ {
+ (*w)[0] = ((b<<10)) + (0);
+ *w +=1;
+ for (i = 0; i < PCHUNK ; i++) (*w)[i] = p[i];
+ *w += (PCHUNK);
+ (*chunk_size) = 1 + BS;
+
+ free(out);
+ free(ex);
+ free(po);
+ return 0;
+ }
+
+ for (i = 0; i < PCHUNK ; i++)
+ {
+ if ( p[i] >= (1<<b))
+ {
+ out[i] = p[i] & ((1<<b)-1); // low b bits stay in the slot
+ ex[e_n] = (p[i] >> b); // high bits become an exception
+ po[(e_n++)] = i; //
+ }
+ else
+ out[i] = p[i];
+ }
+
+ if (1) // force to pass every time
+ {
+ /*get the gap of position*/
+ for(j = e_n-1;j>0;j--)
+ {
+ po[j] = po[j] - po[j-1] ;
+ po[j] --;
+ }
+
+ s = ((b * PCHUNK)>>5);
+ tp = (*w);
+ (*w)[0] = ((num<<10))+e_n; // record b and number of exceptions into this value, in the other version we pick this value out and did not count it
+ (*w) += 1;
+ for (i = 0; i < s; i++) (*w)[i] = 0;
+ pack(out, b, PCHUNK , *w);
+ *w += s;
+
+ unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ;
+ for (j = 0; j < e_n; j++) ; /* interleave ex[] (high bits) and po[] (position gaps) into all_array and Simple16-encode them after the packed block; tail of this step elided */
+ }
+
+ (*exception_n) = e_n;
+ free(out); free(ex); free(po);
+ return 0;
+}
+
+/* pack the low b bits of n values from v into successive 32-bit words of w (MSB first) */
+void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w)
+{
+ unsigned int i, bp, wp;
+ int s;
+ for (i = 0, bp = 0; i < n; i++, bp += b)
+ {
+ wp = bp >> 5;
+ s = 32 - b - (bp & 31);
+ if (s >= 0)
+ w[wp] |= (v[i] << s);
+ else
+ {
+ s = -s;
+ w[wp] |= (v[i] >> s);
+ w[wp+1] = (v[i] << (32 - s));
+ }
+ }
+}
+
+/*modified p4decode */
+unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w, unsigned int * all_array)
+{
+
+ int i, s;
+ unsigned int x;
+ int flag = _w[0];
+ (_w)++;
+
+ unsigned int *_ww,*_pp;
+ unsigned int b = ((flag>>10) & 31);
+ unsigned int e_n = (flag & 1023) ;
+
+ (unpack[b])(_p, _w);
+
+ b = cnum[b];
+ _w += ((b * BS)>>5);
+ unsigned int _k = 0;
+ unsigned int psum = 0;
+ if(e_n != 0 )
+ {
+ for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);)
+ {
+ S16_DECODE(_ww, _pp);
+ }
+
+ _w += (_ww - _w);
+ psum = all_array[0];
+
+ for (i = 0; i < e_n; i++) { /* patch exception i back into _p[psum], advancing psum by the decoded position gaps (tail elided) */ }
+ }
+ return _w;
+}
+
+/* Simple16 decode step: _k is a 4-bit selector choosing one of 16 bit-width layouts for the remaining 28 bits */
+#define S16_DECODE(_w, _p) { _k = (*_w)>>28; \
+ switch(_k) \
+ { \
+ case 0: \
+ *_p = (*_w) & 1; _p++; \
+ *_p = (*_w>>1) & 1; _p++; \
+ *_p = (*_w>>2) & 1; _p++; \
+ *_p = (*_w>>3) & 1; _p++; \
+ *_p = (*_w>>4) & 1; _p++; \
+ *_p = (*_w>>5) & 1; _p++; \
+ *_p = (*_w>>6) & 1; _p++; \
+ *_p = (*_w>>7) & 1; _p++; \
+ *_p = (*_w>>8) & 1; _p++; \
+ *_p = (*_w>>9) & 1; _p++; \
+ *_p = (*_w>>10) & 1; _p++; \
+ *_p = (*_w>>11) & 1; _p++; \
+ *_p = (*_w>>12) & 1; _p++; \
+ *_p = (*_w>>13) & 1; _p++; \
+ *_p = (*_w>>14) & 1; _p++; \
+ *_p = (*_w>>15) & 1; _p++; \
+ *_p = (*_w>>16) & 1; _p++; \
+ *_p = (*_w>>17) & 1; _p++; \
+ *_p = (*_w>>18) & 1; _p++; \
+ *_p = (*_w>>19) & 1; _p++; \
+ *_p = (*_w>>20) & 1; _p++; \
+ *_p = (*_w>>21) & 1; _p++; \
+ *_p = (*_w>>22) & 1; _p++; \
+ *_p = (*_w>>23) & 1; _p++; \
+ *_p = (*_w>>24) & 1; _p++; \
+ *_p = (*_w>>25) & 1; _p++; \
+ *_p = (*_w>>26) & 1; _p++; \
+ *_p = (*_w>>27) & 1; _p++; \
+ break; \
+ case 1: \
+ *_p = (*_w) & 3; _p++; \
+ *_p = (*_w>>2) & 3; _p++; \
+ *_p = (*_w>>4) & 3; _p++; \
+ *_p = (*_w>>6) & 3; _p++; \
+ *_p = (*_w>>8) & 3; _p++; \
+ *_p = (*_w>>10) & 3; _p++; \
+ *_p = (*_w>>12) & 3; _p++; \
+ *_p = (*_w>>14) & 1; _p++; \
+ *_p = (*_w>>15) & 1; _p++; \
+ *_p = (*_w>>16) & 1; _p++; \
+ *_p = (*_w>>17) & 1; _p++; \
+ *_p = (*_w>>18) & 1; _p++; \
+ *_p = (*_w>>19) & 1; _p++; \
+ *_p = (*_w>>20) & 1; _p++; \
+ *_p = (*_w>>21) & 1; _p++; \
+ *_p = (*_w>>22) & 1; _p++; \
+ *_p = (*_w>>23) & 1; _p++; \
+ *_p = (*_w>>24) & 1; _p++; \
+ *_p = (*_w>>25) & 1; _p++; \
+ *_p = (*_w>>26) & 1; _p++; \
+ *_p = (*_w>>27) & 1; _p++; \
+ break; \
+ case 2: \
+ *_p = (*_w) & 1; _p++; \
+ *_p = (*_w>>1) & 1; _p++; \
+ *_p = (*_w>>2) & 1; _p++; \
+ *_p = (*_w>>3) & 1; _p++; \
+ *_p = (*_w>>4) & 1; _p++; \
+ *_p = (*_w>>5) & 1; _p++; \
+ *_p = (*_w>>6) & 1; _p++; \
+ *_p = (*_w>>7) & 3; _p++; \
+ *_p = (*_w>>9) & 3; _p++; \
+ *_p = (*_w>>11) & 3; _p++; \
+ *_p = (*_w>>13) & 3; _p++; \
+ *_p = (*_w>>15) & 3; _p++; \
+ *_p = (*_w>>17) & 3; _p++; \
+ *_p = (*_w>>19) & 3; _p++; \
+ *_p = (*_w>>21) & 1; _p++; \
+ *_p = (*_w>>22) & 1; _p++; \
+ *_p = (*_w>>23) & 1; _p++; \
+ *_p = (*_w>>24) & 1; _p++; \
+ *_p = (*_w>>25) & 1; _p++; \
+ *_p = (*_w>>26) & 1; _p++; \
+ *_p = (*_w>>27) & 1; _p++; \
+ break; \
+ case 3: \
+ *_p = (*_w) & 1; _p++; \
+ *_p = (*_w>>1) & 1; _p++; \
+ *_p = (*_w>>2) & 1; _p++; \
+ *_p = (*_w>>3) & 1; _p++; \
+ *_p = (*_w>>4) & 1; _p++; \
+ *_p = (*_w>>5) & 1; _p++; \
+ *_p = (*_w>>6) & 1; _p++; \
+ *_p = (*_w>>7) & 1; _p++; \
+ *_p = (*_w>>8) & 1; _p++; \
+ *_p = (*_w>>9) & 1; _p++; \
+ *_p = (*_w>>10) & 1; _p++; \
+ *_p = (*_w>>11) & 1; _p++; \
+ *_p = (*_w>>12) & 1; _p++; \
+ *_p = (*_w>>13) & 1; _p++; \
+ *_p = (*_w>>14) & 3; _p++; \
+ *_p = (*_w>>16) & 3; _p++; \
+ *_p = (*_w>>18) & 3; _p++; \
+ *_p = (*_w>>20) & 3; _p++; \
+ *_p = (*_w>>22) & 3; _p++; \
+ *_p = (*_w>>24) & 3; _p++; \
+ *_p = (*_w>>26) & 3; _p++; \
+ break; \
+ case 4: \
+ *_p = (*_w) & 3; _p++; \
+ *_p = (*_w>>2) & 3; _p++; \
+ *_p = (*_w>>4) & 3; _p++; \
+ *_p = (*_w>>6) & 3; _p++; \
+ *_p = (*_w>>8) & 3; _p++; \
+ *_p = (*_w>>10) & 3; _p++; \
+ *_p = (*_w>>12) & 3; _p++; \
+ *_p = (*_w>>14) & 3; _p++; \
+ *_p = (*_w>>16) & 3; _p++; \
+ *_p = (*_w>>18) & 3; _p++; \
+ *_p = (*_w>>20) & 3; _p++; \
+ *_p = (*_w>>22) & 3; _p++; \
+ *_p = (*_w>>24) & 3; _p++; \
+ *_p = (*_w>>26) & 3; _p++; \
+ break; \
+ case 5: \
+ *_p = (*_w) & 15; _p++; \
+ *_p = (*_w>>4) & 7; _p++; \
+ *_p = (*_w>>7) & 7; _p++; \
+ *_p = (*_w>>10) & 7; _p++; \
+ *_p = (*_w>>13) & 7; _p++; \
+ *_p = (*_w>>16) & 7; _p++; \
+ *_p = (*_w>>19) & 7; _p++; \
+ *_p = (*_w>>22) & 7; _p++; \
+ *_p = (*_w>>25) & 7; _p++; \
+ break; \
+ case 6: \
+ *_p = (*_w) & 7; _p++; \
+ *_p = (*_w>>3) & 15; _p++; \
+ *_p = (*_w>>7) & 15; _p++; \
+ *_p = (*_w>>11) & 15; _p++; \
+ *_p = (*_w>>15) & 15; _p++; \
+ *_p = (*_w>>19) & 7; _p++; \
+ *_p = (*_w>>22) & 7; _p++; \
+ *_p = (*_w>>25) & 7; _p++; \
+ break; \
+ case 7: \
+ *_p = (*_w) & 15; _p++; \
+ *_p = (*_w>>4) & 15; _p++; \
+ *_p = (*_w>>8) & 15; _p++; \
+ *_p = (*_w>>12) & 15; _p++; \
+ *_p = (*_w>>16) & 15; _p++; \
+ *_p = (*_w>>20) & 15; _p++; \
+ *_p = (*_w>>24) & 15; _p++; \
+ break; \
+ case 8: \
+ *_p = (*_w) & 31; _p++; \
+ *_p = (*_w>>5) & 31; _p++; \
+ *_p = (*_w>>10) & 31; _p++; \
+ *_p = (*_w>>15) & 31; _p++; \
+ *_p = (*_w>>20) & 15; _p++; \
+ *_p = (*_w>>24) & 15; _p++; \
+ break; \
+ case 9: \
+ *_p = (*_w) & 15; _p++; \
+ *_p = (*_w>>4) & 15; _p++; \
+ *_p = (*_w>>8) & 31; _p++; \
+ *_p = (*_w>>13) & 31; _p++; \
+ *_p = (*_w>>18) & 31; _p++; \
+ *_p = (*_w>>23) & 31; _p++; \
+ break; \
+ case 10: \
+ *_p = (*_w) & 63; _p++; \
+ *_p = (*_w>>6) & 63; _p++; \
+ *_p = (*_w>>12) & 63; _p++; \
+ *_p = (*_w>>18) & 31; _p++; \
+ *_p = (*_w>>23) & 31; _p++; \
+ break; \
+ case 11: \
+ *_p = (*_w) & 31; _p++; \
+ *_p = (*_w>>5) & 31; _p++; \
+ *_p = (*_w>>10) & 63; _p++; \
+ *_p = (*_w>>16) & 63; _p++; \
+ *_p = (*_w>>22) & 63; _p++; \
+ break; \
+ case 12: \
+ *_p = (*_w) & 127; _p++; \
+ *_p = (*_w>>7) & 127; _p++; \
+ *_p = (*_w>>14) & 127; _p++; \
+ *_p = (*_w>>21) & 127; _p++; \
+ break; \
+ case 13: \
+ *_p = (*_w) & 1023; _p++; \
+ *_p = (*_w>>10) & 511; _p++; \
+ *_p = (*_w>>19) & 511; _p++; \
+ break; \
+ case 14: \
+ *_p = (*_w) & 16383; _p++; \
+ *_p = (*_w>>14) & 16383; _p++; \
+ break; \
+ case 15: \
+ *_p = (*_w) & ((1<<28)-1); _p++; \
+ break; \
+ }\
+ _w++; \
+}
+
+
+
+
+
diff --git a/ext/OPT_PFD/unpack.h b/ext/OPT_PFD/unpack.h
new file mode 100644
index 0000000..fa810e9
--- /dev/null
+++ b/ext/OPT_PFD/unpack.h
@@ -0,0 +1,773 @@
+
+/*************************************************************/
+/* macros for fast unpacking of integers of fixed bit length */
+/*************************************************************/
+
+#define BS 128
+
+/* supported bit lengths */
+int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32};
+
+void unpack0(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i++) p[i] = 0;
+}
+
+
+void unpack1(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 1)
+ {
+ p[0] = (w[0] >> 31);
+ p[1] = (w[0] >> 30) & 1;
+ p[2] = (w[0] >> 29) & 1;
+ p[3] = (w[0] >> 28) & 1;
+ p[4] = (w[0] >> 27) & 1;
+ p[5] = (w[0] >> 26) & 1;
+ p[6] = (w[0] >> 25) & 1;
+ p[7] = (w[0] >> 24) & 1;
+ p[8] = (w[0] >> 23) & 1;
+ p[9] = (w[0] >> 22) & 1;
+ p[10] = (w[0] >> 21) & 1;
+ p[11] = (w[0] >> 20) & 1;
+ p[12] = (w[0] >> 19) & 1;
+ p[13] = (w[0] >> 18) & 1;
+ p[14] = (w[0] >> 17) & 1;
+ p[15] = (w[0] >> 16) & 1;
+ p[16] = (w[0] >> 15) & 1;
+ p[17] = (w[0] >> 14) & 1;
+ p[18] = (w[0] >> 13) & 1;
+ p[19] = (w[0] >> 12) & 1;
+ p[20] = (w[0] >> 11) & 1;
+ p[21] = (w[0] >> 10) & 1;
+ p[22] = (w[0] >> 9) & 1;
+ p[23] = (w[0] >> 8) & 1;
+ p[24] = (w[0] >> 7) & 1;
+ p[25] = (w[0] >> 6) & 1;
+ p[26] = (w[0] >> 5) & 1;
+ p[27] = (w[0] >> 4) & 1;
+ p[28] = (w[0] >> 3) & 1;
+ p[29] = (w[0] >> 2) & 1;
+ p[30] = (w[0] >> 1) & 1;
+ p[31] = (w[0]) & 1;
+ }
+}
+
+
+void unpack2(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 2)
+ {
+ p[0] = (w[0] >> 30);
+ p[1] = (w[0] >> 28) & 3;
+ p[2] = (w[0] >> 26) & 3;
+ p[3] = (w[0] >> 24) & 3;
+ p[4] = (w[0] >> 22) & 3;
+ p[5] = (w[0] >> 20) & 3;
+ p[6] = (w[0] >> 18) & 3;
+ p[7] = (w[0] >> 16) & 3;
+ p[8] = (w[0] >> 14) & 3;
+ p[9] = (w[0] >> 12) & 3;
+ p[10] = (w[0] >> 10) & 3;
+ p[11] = (w[0] >> 8) & 3;
+ p[12] = (w[0] >> 6) & 3;
+ p[13] = (w[0] >> 4) & 3;
+ p[14] = (w[0] >> 2) & 3;
+ p[15] = (w[0]) & 3;
+ p[16] = (w[1] >> 30);
+ p[17] = (w[1] >> 28) & 3;
+ p[18] = (w[1] >> 26) & 3;
+ p[19] = (w[1] >> 24) & 3;
+ p[20] = (w[1] >> 22) & 3;
+ p[21] = (w[1] >> 20) & 3;
+ p[22] = (w[1] >> 18) & 3;
+ p[23] = (w[1] >> 16) & 3;
+ p[24] = (w[1] >> 14) & 3;
+ p[25] = (w[1] >> 12) & 3;
+ p[26] = (w[1] >> 10) & 3;
+ p[27] = (w[1] >> 8) & 3;
+ p[28] = (w[1] >> 6) & 3;
+ p[29] = (w[1] >> 4) & 3;
+ p[30] = (w[1] >> 2) & 3;
+ p[31] = (w[1]) & 3;
+ }
+}
+
+
+void unpack3(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 3)
+ {
+ p[0] = (w[0] >> 29);
+ p[1] = (w[0] >> 26) & 7;
+ p[2] = (w[0] >> 23) & 7;
+ p[3] = (w[0] >> 20) & 7;
+ p[4] = (w[0] >> 17) & 7;
+ p[5] = (w[0] >> 14) & 7;
+ p[6] = (w[0] >> 11) & 7;
+ p[7] = (w[0] >> 8) & 7;
+ p[8] = (w[0] >> 5) & 7;
+ p[9] = (w[0] >> 2) & 7;
+ p[10] = (w[0] << 1) & 7;
+ p[10] |= (w[1] >> 31);
+ p[11] = (w[1] >> 28) & 7;
+ p[12] = (w[1] >> 25) & 7;
+ p[13] = (w[1] >> 22) & 7;
+ p[14] = (w[1] >> 19) & 7;
+ p[15] = (w[1] >> 16) & 7;
+ p[16] = (w[1] >> 13) & 7;
+ p[17] = (w[1] >> 10) & 7;
+ p[18] = (w[1] >> 7) & 7;
+ p[19] = (w[1] >> 4) & 7;
+ p[20] = (w[1] >> 1) & 7;
+ p[21] = (w[1] << 2) & 7;
+ p[21] |= (w[2] >> 30);
+ p[22] = (w[2] >> 27) & 7;
+ p[23] = (w[2] >> 24) & 7;
+ p[24] = (w[2] >> 21) & 7;
+ p[25] = (w[2] >> 18) & 7;
+ p[26] = (w[2] >> 15) & 7;
+ p[27] = (w[2] >> 12) & 7;
+ p[28] = (w[2] >> 9) & 7;
+ p[29] = (w[2] >> 6) & 7;
+ p[30] = (w[2] >> 3) & 7;
+ p[31] = (w[2]) & 7;
+ }
+}
+
+
+void unpack4(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 4)
+ {
+ p[0] = (w[0] >> 28);
+ p[1] = (w[0] >> 24) & 15;
+ p[2] = (w[0] >> 20) & 15;
+ p[3] = (w[0] >> 16) & 15;
+ p[4] = (w[0] >> 12) & 15;
+ p[5] = (w[0] >> 8) & 15;
+ p[6] = (w[0] >> 4) & 15;
+ p[7] = (w[0]) & 15;
+ p[8] = (w[1] >> 28);
+ p[9] = (w[1] >> 24) & 15;
+ p[10] = (w[1] >> 20) & 15;
+ p[11] = (w[1] >> 16) & 15;
+ p[12] = (w[1] >> 12) & 15;
+ p[13] = (w[1] >> 8) & 15;
+ p[14] = (w[1] >> 4) & 15;
+ p[15] = (w[1]) & 15;
+ p[16] = (w[2] >> 28);
+ p[17] = (w[2] >> 24) & 15;
+ p[18] = (w[2] >> 20) & 15;
+ p[19] = (w[2] >> 16) & 15;
+ p[20] = (w[2] >> 12) & 15;
+ p[21] = (w[2] >> 8) & 15;
+ p[22] = (w[2] >> 4) & 15;
+ p[23] = (w[2]) & 15;
+ p[24] = (w[3] >> 28);
+ p[25] = (w[3] >> 24) & 15;
+ p[26] = (w[3] >> 20) & 15;
+ p[27] = (w[3] >> 16) & 15;
+ p[28] = (w[3] >> 12) & 15;
+ p[29] = (w[3] >> 8) & 15;
+ p[30] = (w[3] >> 4) & 15;
+ p[31] = (w[3]) & 15;
+ }
+}
+
+
+void unpack5(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 5)
+ {
+ p[0] = (w[0] >> 27);
+ p[1] = (w[0] >> 22) & 31;
+ p[2] = (w[0] >> 17) & 31;
+ p[3] = (w[0] >> 12) & 31;
+ p[4] = (w[0] >> 7) & 31;
+ p[5] = (w[0] >> 2) & 31;
+ p[6] = (w[0] << 3) & 31;
+ p[6] |= (w[1] >> 29);
+ p[7] = (w[1] >> 24) & 31;
+ p[8] = (w[1] >> 19) & 31;
+ p[9] = (w[1] >> 14) & 31;
+ p[10] = (w[1] >> 9) & 31;
+ p[11] = (w[1] >> 4) & 31;
+ p[12] = (w[1] << 1) & 31;
+ p[12] |= (w[2] >> 31);
+ p[13] = (w[2] >> 26) & 31;
+ p[14] = (w[2] >> 21) & 31;
+ p[15] = (w[2] >> 16) & 31;
+ p[16] = (w[2] >> 11) & 31;
+ p[17] = (w[2] >> 6) & 31;
+ p[18] = (w[2] >> 1) & 31;
+ p[19] = (w[2] << 4) & 31;
+ p[19] |= (w[3] >> 28);
+ p[20] = (w[3] >> 23) & 31;
+ p[21] = (w[3] >> 18) & 31;
+ p[22] = (w[3] >> 13) & 31;
+ p[23] = (w[3] >> 8) & 31;
+ p[24] = (w[3] >> 3) & 31;
+ p[25] = (w[3] << 2) & 31;
+ p[25] |= (w[4] >> 30);
+ p[26] = (w[4] >> 25) & 31;
+ p[27] = (w[4] >> 20) & 31;
+ p[28] = (w[4] >> 15) & 31;
+ p[29] = (w[4] >> 10) & 31;
+ p[30] = (w[4] >> 5) & 31;
+ p[31] = (w[4]) & 31;
+ }
+}
+
+
+void unpack6(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 6)
+ {
+ p[0] = (w[0] >> 26);
+ p[1] = (w[0] >> 20) & 63;
+ p[2] = (w[0] >> 14) & 63;
+ p[3] = (w[0] >> 8) & 63;
+ p[4] = (w[0] >> 2) & 63;
+ p[5] = (w[0] << 4) & 63;
+ p[5] |= (w[1] >> 28);
+ p[6] = (w[1] >> 22) & 63;
+ p[7] = (w[1] >> 16) & 63;
+ p[8] = (w[1] >> 10) & 63;
+ p[9] = (w[1] >> 4) & 63;
+ p[10] = (w[1] << 2) & 63;
+ p[10] |= (w[2] >> 30);
+ p[11] = (w[2] >> 24) & 63;
+ p[12] = (w[2] >> 18) & 63;
+ p[13] = (w[2] >> 12) & 63;
+ p[14] = (w[2] >> 6) & 63;
+ p[15] = (w[2]) & 63;
+ p[16] = (w[3] >> 26);
+ p[17] = (w[3] >> 20) & 63;
+ p[18] = (w[3] >> 14) & 63;
+ p[19] = (w[3] >> 8) & 63;
+ p[20] = (w[3] >> 2) & 63;
+ p[21] = (w[3] << 4) & 63;
+ p[21] |= (w[4] >> 28);
+ p[22] = (w[4] >> 22) & 63;
+ p[23] = (w[4] >> 16) & 63;
+ p[24] = (w[4] >> 10) & 63;
+ p[25] = (w[4] >> 4) & 63;
+ p[26] = (w[4] << 2) & 63;
+ p[26] |= (w[5] >> 30);
+ p[27] = (w[5] >> 24) & 63;
+ p[28] = (w[5] >> 18) & 63;
+ p[29] = (w[5] >> 12) & 63;
+ p[30] = (w[5] >> 6) & 63;
+ p[31] = (w[5]) & 63;
+ }
+}
+
+
+void unpack7(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 7)
+ {
+ p[0] = (w[0] >> 25);
+ p[1] = (w[0] >> 18) & 127;
+ p[2] = (w[0] >> 11) & 127;
+ p[3] = (w[0] >> 4) & 127;
+ p[4] = (w[0] << 3) & 127;
+ p[4] |= (w[1] >> 29);
+ p[5] = (w[1] >> 22) & 127;
+ p[6] = (w[1] >> 15) & 127;
+ p[7] = (w[1] >> 8) & 127;
+ p[8] = (w[1] >> 1) & 127;
+ p[9] = (w[1] << 6) & 127;
+ p[9] |= (w[2] >> 26);
+ p[10] = (w[2] >> 19) & 127;
+ p[11] = (w[2] >> 12) & 127;
+ p[12] = (w[2] >> 5) & 127;
+ p[13] = (w[2] << 2) & 127;
+ p[13] |= (w[3] >> 30);
+ p[14] = (w[3] >> 23) & 127;
+ p[15] = (w[3] >> 16) & 127;
+ p[16] = (w[3] >> 9) & 127;
+ p[17] = (w[3] >> 2) & 127;
+ p[18] = (w[3] << 5) & 127;
+ p[18] |= (w[4] >> 27);
+ p[19] = (w[4] >> 20) & 127;
+ p[20] = (w[4] >> 13) & 127;
+ p[21] = (w[4] >> 6) & 127;
+ p[22] = (w[4] << 1) & 127;
+ p[22] |= (w[5] >> 31);
+ p[23] = (w[5] >> 24) & 127;
+ p[24] = (w[5] >> 17) & 127;
+ p[25] = (w[5] >> 10) & 127;
+ p[26] = (w[5] >> 3) & 127;
+ p[27] = (w[5] << 4) & 127;
+ p[27] |= (w[6] >> 28);
+ p[28] = (w[6] >> 21) & 127;
+ p[29] = (w[6] >> 14) & 127;
+ p[30] = (w[6] >> 7) & 127;
+ p[31] = (w[6]) & 127;
+ }
+}
+
+
+void unpack8(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 8)
+ {
+ p[0] = (w[0] >> 24);
+ p[1] = (w[0] >> 16) & 255;
+ p[2] = (w[0] >> 8) & 255;
+ p[3] = (w[0]) & 255;
+ p[4] = (w[1] >> 24);
+ p[5] = (w[1] >> 16) & 255;
+ p[6] = (w[1] >> 8) & 255;
+ p[7] = (w[1]) & 255;
+ p[8] = (w[2] >> 24);
+ p[9] = (w[2] >> 16) & 255;
+ p[10] = (w[2] >> 8) & 255;
+ p[11] = (w[2]) & 255;
+ p[12] = (w[3] >> 24);
+ p[13] = (w[3] >> 16) & 255;
+ p[14] = (w[3] >> 8) & 255;
+ p[15] = (w[3]) & 255;
+ p[16] = (w[4] >> 24);
+ p[17] = (w[4] >> 16) & 255;
+ p[18] = (w[4] >> 8) & 255;
+ p[19] = (w[4]) & 255;
+ p[20] = (w[5] >> 24);
+ p[21] = (w[5] >> 16) & 255;
+ p[22] = (w[5] >> 8) & 255;
+ p[23] = (w[5]) & 255;
+ p[24] = (w[6] >> 24);
+ p[25] = (w[6] >> 16) & 255;
+ p[26] = (w[6] >> 8) & 255;
+ p[27] = (w[6]) & 255;
+ p[28] = (w[7] >> 24);
+ p[29] = (w[7] >> 16) & 255;
+ p[30] = (w[7] >> 8) & 255;
+ p[31] = (w[7]) & 255;
+ }
+}
+
+
+void unpack9(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 9)
+ {
+ p[0] = (w[0] >> 23);
+ p[1] = (w[0] >> 14) & 511;
+ p[2] = (w[0] >> 5) & 511;
+ p[3] = (w[0] << 4) & 511;
+ p[3] |= (w[1] >> 28);
+ p[4] = (w[1] >> 19) & 511;
+ p[5] = (w[1] >> 10) & 511;
+ p[6] = (w[1] >> 1) & 511;
+ p[7] = (w[1] << 8) & 511;
+ p[7] |= (w[2] >> 24);
+ p[8] = (w[2] >> 15) & 511;
+ p[9] = (w[2] >> 6) & 511;
+ p[10] = (w[2] << 3) & 511;
+ p[10] |= (w[3] >> 29);
+ p[11] = (w[3] >> 20) & 511;
+ p[12] = (w[3] >> 11) & 511;
+ p[13] = (w[3] >> 2) & 511;
+ p[14] = (w[3] << 7) & 511;
+ p[14] |= (w[4] >> 25);
+ p[15] = (w[4] >> 16) & 511;
+ p[16] = (w[4] >> 7) & 511;
+ p[17] = (w[4] << 2) & 511;
+ p[17] |= (w[5] >> 30);
+ p[18] = (w[5] >> 21) & 511;
+ p[19] = (w[5] >> 12) & 511;
+ p[20] = (w[5] >> 3) & 511;
+ p[21] = (w[5] << 6) & 511;
+ p[21] |= (w[6] >> 26);
+ p[22] = (w[6] >> 17) & 511;
+ p[23] = (w[6] >> 8) & 511;
+ p[24] = (w[6] << 1) & 511;
+ p[24] |= (w[7] >> 31);
+ p[25] = (w[7] >> 22) & 511;
+ p[26] = (w[7] >> 13) & 511;
+ p[27] = (w[7] >> 4) & 511;
+ p[28] = (w[7] << 5) & 511;
+ p[28] |= (w[8] >> 27);
+ p[29] = (w[8] >> 18) & 511;
+ p[30] = (w[8] >> 9) & 511;
+ p[31] = (w[8]) & 511;
+ }
+}
+
+
+void unpack10(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 10)
+ {
+ p[0] = (w[0] >> 22);
+ p[1] = (w[0] >> 12) & 1023;
+ p[2] = (w[0] >> 2) & 1023;
+ p[3] = (w[0] << 8) & 1023;
+ p[3] |= (w[1] >> 24);
+ p[4] = (w[1] >> 14) & 1023;
+ p[5] = (w[1] >> 4) & 1023;
+ p[6] = (w[1] << 6) & 1023;
+ p[6] |= (w[2] >> 26);
+ p[7] = (w[2] >> 16) & 1023;
+ p[8] = (w[2] >> 6) & 1023;
+ p[9] = (w[2] << 4) & 1023;
+ p[9] |= (w[3] >> 28);
+ p[10] = (w[3] >> 18) & 1023;
+ p[11] = (w[3] >> 8) & 1023;
+ p[12] = (w[3] << 2) & 1023;
+ p[12] |= (w[4] >> 30);
+ p[13] = (w[4] >> 20) & 1023;
+ p[14] = (w[4] >> 10) & 1023;
+ p[15] = (w[4]) & 1023;
+ p[16] = (w[5] >> 22);
+ p[17] = (w[5] >> 12) & 1023;
+ p[18] = (w[5] >> 2) & 1023;
+ p[19] = (w[5] << 8) & 1023;
+ p[19] |= (w[6] >> 24);
+ p[20] = (w[6] >> 14) & 1023;
+ p[21] = (w[6] >> 4) & 1023;
+ p[22] = (w[6] << 6) & 1023;
+ p[22] |= (w[7] >> 26);
+ p[23] = (w[7] >> 16) & 1023;
+ p[24] = (w[7] >> 6) & 1023;
+ p[25] = (w[7] << 4) & 1023;
+ p[25] |= (w[8] >> 28);
+ p[26] = (w[8] >> 18) & 1023;
+ p[27] = (w[8] >> 8) & 1023;
+ p[28] = (w[8] << 2) & 1023;
+ p[28] |= (w[9] >> 30);
+ p[29] = (w[9] >> 20) & 1023;
+ p[30] = (w[9] >> 10) & 1023;
+ p[31] = (w[9]) & 1023;
+ }
+}
+
+
+void unpack11(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 11)
+ {
+ p[0] = (w[0] >> 21);
+ p[1] = (w[0] >> 10) & 2047;
+ p[2] = (w[0] << 1) & 2047;
+ p[2] |= (w[1] >> 31);
+ p[3] = (w[1] >> 20) & 2047;
+ p[4] = (w[1] >> 9) & 2047;
+ p[5] = (w[1] << 2) & 2047;
+ p[5] |= (w[2] >> 30);
+ p[6] = (w[2] >> 19) & 2047;
+ p[7] = (w[2] >> 8) & 2047;
+ p[8] = (w[2] << 3) & 2047;
+ p[8] |= (w[3] >> 29);
+ p[9] = (w[3] >> 18) & 2047;
+ p[10] = (w[3] >> 7) & 2047;
+ p[11] = (w[3] << 4) & 2047;
+ p[11] |= (w[4] >> 28);
+ p[12] = (w[4] >> 17) & 2047;
+ p[13] = (w[4] >> 6) & 2047;
+ p[14] = (w[4] << 5) & 2047;
+ p[14] |= (w[5] >> 27);
+ p[15] = (w[5] >> 16) & 2047;
+ p[16] = (w[5] >> 5) & 2047;
+ p[17] = (w[5] << 6) & 2047;
+ p[17] |= (w[6] >> 26);
+ p[18] = (w[6] >> 15) & 2047;
+ p[19] = (w[6] >> 4) & 2047;
+ p[20] = (w[6] << 7) & 2047;
+ p[20] |= (w[7] >> 25);
+ p[21] = (w[7] >> 14) & 2047;
+ p[22] = (w[7] >> 3) & 2047;
+ p[23] = (w[7] << 8) & 2047;
+ p[23] |= (w[8] >> 24);
+ p[24] = (w[8] >> 13) & 2047;
+ p[25] = (w[8] >> 2) & 2047;
+ p[26] = (w[8] << 9) & 2047;
+ p[26] |= (w[9] >> 23);
+ p[27] = (w[9] >> 12) & 2047;
+ p[28] = (w[9] >> 1) & 2047;
+ p[29] = (w[9] << 10) & 2047;
+ p[29] |= (w[10] >> 22);
+ p[30] = (w[10] >> 11) & 2047;
+ p[31] = (w[10]) & 2047;
+ }
+}
+
+
+void unpack12(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 12)
+ {
+ p[0] = (w[0] >> 20);
+ p[1] = (w[0] >> 8) & 4095;
+ p[2] = (w[0] << 4) & 4095;
+ p[2] |= (w[1] >> 28);
+ p[3] = (w[1] >> 16) & 4095;
+ p[4] = (w[1] >> 4) & 4095;
+ p[5] = (w[1] << 8) & 4095;
+ p[5] |= (w[2] >> 24);
+ p[6] = (w[2] >> 12) & 4095;
+ p[7] = (w[2]) & 4095;
+ p[8] = (w[3] >> 20);
+ p[9] = (w[3] >> 8) & 4095;
+ p[10] = (w[3] << 4) & 4095;
+ p[10] |= (w[4] >> 28);
+ p[11] = (w[4] >> 16) & 4095;
+ p[12] = (w[4] >> 4) & 4095;
+ p[13] = (w[4] << 8) & 4095;
+ p[13] |= (w[5] >> 24);
+ p[14] = (w[5] >> 12) & 4095;
+ p[15] = (w[5]) & 4095;
+ p[16] = (w[6] >> 20);
+ p[17] = (w[6] >> 8) & 4095;
+ p[18] = (w[6] << 4) & 4095;
+ p[18] |= (w[7] >> 28);
+ p[19] = (w[7] >> 16) & 4095;
+ p[20] = (w[7] >> 4) & 4095;
+ p[21] = (w[7] << 8) & 4095;
+ p[21] |= (w[8] >> 24);
+ p[22] = (w[8] >> 12) & 4095;
+ p[23] = (w[8]) & 4095;
+ p[24] = (w[9] >> 20);
+ p[25] = (w[9] >> 8) & 4095;
+ p[26] = (w[9] << 4) & 4095;
+ p[26] |= (w[10] >> 28);
+ p[27] = (w[10] >> 16) & 4095;
+ p[28] = (w[10] >> 4) & 4095;
+ p[29] = (w[10] << 8) & 4095;
+ p[29] |= (w[11] >> 24);
+ p[30] = (w[11] >> 12) & 4095;
+ p[31] = (w[11]) & 4095;
+ }
+}
+
+
+void unpack13(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 13)
+ {
+ p[0] = (w[0] >> 19);
+ p[1] = (w[0] >> 6) & 8191;
+ p[2] = (w[0] << 7) & 8191;
+ p[2] |= (w[1] >> 25);
+ p[3] = (w[1] >> 12) & 8191;
+ p[4] = (w[1] << 1) & 8191;
+ p[4] |= (w[2] >> 31);
+ p[5] = (w[2] >> 18) & 8191;
+ p[6] = (w[2] >> 5) & 8191;
+ p[7] = (w[2] << 8) & 8191;
+ p[7] |= (w[3] >> 24);
+ p[8] = (w[3] >> 11) & 8191;
+ p[9] = (w[3] << 2) & 8191;
+ p[9] |= (w[4] >> 30);
+ p[10] = (w[4] >> 17) & 8191;
+ p[11] = (w[4] >> 4) & 8191;
+ p[12] = (w[4] << 9) & 8191;
+ p[12] |= (w[5] >> 23);
+ p[13] = (w[5] >> 10) & 8191;
+ p[14] = (w[5] << 3) & 8191;
+ p[14] |= (w[6] >> 29);
+ p[15] = (w[6] >> 16) & 8191;
+ p[16] = (w[6] >> 3) & 8191;
+ p[17] = (w[6] << 10) & 8191;
+ p[17] |= (w[7] >> 22);
+ p[18] = (w[7] >> 9) & 8191;
+ p[19] = (w[7] << 4) & 8191;
+ p[19] |= (w[8] >> 28);
+ p[20] = (w[8] >> 15) & 8191;
+ p[21] = (w[8] >> 2) & 8191;
+ p[22] = (w[8] << 11) & 8191;
+ p[22] |= (w[9] >> 21);
+ p[23] = (w[9] >> 8) & 8191;
+ p[24] = (w[9] << 5) & 8191;
+ p[24] |= (w[10] >> 27);
+ p[25] = (w[10] >> 14) & 8191;
+ p[26] = (w[10] >> 1) & 8191;
+ p[27] = (w[10] << 12) & 8191;
+ p[27] |= (w[11] >> 20);
+ p[28] = (w[11] >> 7) & 8191;
+ p[29] = (w[11] << 6) & 8191;
+ p[29] |= (w[12] >> 26);
+ p[30] = (w[12] >> 13) & 8191;
+ p[31] = (w[12]) & 8191;
+ }
+}
+
+
+void unpack16(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 16)
+ {
+ p[0] = (w[0] >> 16);
+ p[1] = (w[0]) & 65535;
+ p[2] = (w[1] >> 16);
+ p[3] = (w[1]) & 65535;
+ p[4] = (w[2] >> 16);
+ p[5] = (w[2]) & 65535;
+ p[6] = (w[3] >> 16);
+ p[7] = (w[3]) & 65535;
+ p[8] = (w[4] >> 16);
+ p[9] = (w[4]) & 65535;
+ p[10] = (w[5] >> 16);
+ p[11] = (w[5]) & 65535;
+ p[12] = (w[6] >> 16);
+ p[13] = (w[6]) & 65535;
+ p[14] = (w[7] >> 16);
+ p[15] = (w[7]) & 65535;
+ p[16] = (w[8] >> 16);
+ p[17] = (w[8]) & 65535;
+ p[18] = (w[9] >> 16);
+ p[19] = (w[9]) & 65535;
+ p[20] = (w[10] >> 16);
+ p[21] = (w[10]) & 65535;
+ p[22] = (w[11] >> 16);
+ p[23] = (w[11]) & 65535;
+ p[24] = (w[12] >> 16);
+ p[25] = (w[12]) & 65535;
+ p[26] = (w[13] >> 16);
+ p[27] = (w[13]) & 65535;
+ p[28] = (w[14] >> 16);
+ p[29] = (w[14]) & 65535;
+ p[30] = (w[15] >> 16);
+ p[31] = (w[15]) & 65535;
+ }
+}
+
+
+void unpack20(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 20)
+ {
+ p[0] = (w[0] >> 12);
+ p[1] = (w[0] << 8) & ((1<<20)-1);
+ p[1] |= (w[1] >> 24);
+ p[2] = (w[1] >> 4) & ((1<<20)-1);
+ p[3] = (w[1] << 16) & ((1<<20)-1);
+ p[3] |= (w[2] >> 16);
+ p[4] = (w[2] << 4) & ((1<<20)-1);
+ p[4] |= (w[3] >> 28);
+ p[5] = (w[3] >> 8) & ((1<<20)-1);
+ p[6] = (w[3] << 12) & ((1<<20)-1);
+ p[6] |= (w[4] >> 20);
+ p[7] = (w[4]) & ((1<<20)-1);
+ p[8] = (w[5] >> 12);
+ p[9] = (w[5] << 8) & ((1<<20)-1);
+ p[9] |= (w[6] >> 24);
+ p[10] = (w[6] >> 4) & ((1<<20)-1);
+ p[11] = (w[6] << 16) & ((1<<20)-1);
+ p[11] |= (w[7] >> 16);
+ p[12] = (w[7] << 4) & ((1<<20)-1);
+ p[12] |= (w[8] >> 28);
+ p[13] = (w[8] >> 8) & ((1<<20)-1);
+ p[14] = (w[8] << 12) & ((1<<20)-1);
+ p[14] |= (w[9] >> 20);
+ p[15] = (w[9]) & ((1<<20)-1);
+ p[16] = (w[10] >> 12);
+ p[17] = (w[10] << 8) & ((1<<20)-1);
+ p[17] |= (w[11] >> 24);
+ p[18] = (w[11] >> 4) & ((1<<20)-1);
+ p[19] = (w[11] << 16) & ((1<<20)-1);
+ p[19] |= (w[12] >> 16);
+ p[20] = (w[12] << 4) & ((1<<20)-1);
+ p[20] |= (w[13] >> 28);
+ p[21] = (w[13] >> 8) & ((1<<20)-1);
+ p[22] = (w[13] << 12) & ((1<<20)-1);
+ p[22] |= (w[14] >> 20);
+ p[23] = (w[14]) & ((1<<20)-1);
+ p[24] = (w[15] >> 12);
+ p[25] = (w[15] << 8) & ((1<<20)-1);
+ p[25] |= (w[16] >> 24);
+ p[26] = (w[16] >> 4) & ((1<<20)-1);
+ p[27] = (w[16] << 16) & ((1<<20)-1);
+ p[27] |= (w[17] >> 16);
+ p[28] = (w[17] << 4) & ((1<<20)-1);
+ p[28] |= (w[18] >> 28);
+ p[29] = (w[18] >> 8) & ((1<<20)-1);
+ p[30] = (w[18] << 12) & ((1<<20)-1);
+ p[30] |= (w[19] >> 20);
+ p[31] = (w[19]) & ((1<<20)-1);
+ }
+}
+
+
+void unpack32(unsigned int *p, unsigned int *w)
+{
+ int i;
+
+ for (i = 0; i < BS; i += 32, p += 32, w += 32)
+ {
+ p[0] = w[0];
+ p[1] = w[1];
+ p[2] = w[2];
+ p[3] = w[3];
+ p[4] = w[4];
+ p[5] = w[5];
+ p[6] = w[6];
+ p[7] = w[7];
+ p[8] = w[8];
+ p[9] = w[9];
+ p[10] = w[10];
+ p[11] = w[11];
+ p[12] = w[12];
+ p[13] = w[13];
+ p[14] = w[14];
+ p[15] = w[15];
+ p[16] = w[16];
+ p[17] = w[17];
+ p[18] = w[18];
+ p[19] = w[19];
+ p[20] = w[20];
+ p[21] = w[21];
+ p[22] = w[22];
+ p[23] = w[23];
+ p[24] = w[24];
+ p[25] = w[25];
+ p[26] = w[26];
+ p[27] = w[27];
+ p[28] = w[28];
+ p[29] = w[29];
+ p[30] = w[30];
+ p[31] = w[31];
+ }
+}
+
+
+typedef void (*pf)(unsigned int *p, unsigned int *w);
+pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5,
+ unpack6, unpack7, unpack8, unpack9, unpack10, unpack11,
+ unpack12, unpack13, unpack16, unpack20, unpack32};
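+
+/* Usage sketch (hypothetical, not part of the shipped API): the table is
+   indexed by a code rather than by the bit width itself -- entries 0..13
+   cover widths 0..13, and entries 14/15/16 cover widths 16/20/32.  A
+   caller holding an arbitrary width b would first round it up to the
+   nearest supported width:
+
+     static int bits2idx(int b) {   // hypothetical helper
+         if (b <= 13) return b;     // widths 0..13 map directly
+         if (b <= 16) return 14;    // round up to 16
+         if (b <= 20) return 15;    // round up to 20
+         return 16;                 // fall back to full 32-bit words
+     }
+     unpack[bits2idx(b)](p, w);     // decode one block of BS integers into p
+*/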
+
diff --git a/ext/simdcomp/bitpacka.c b/ext/simdcomp/bitpacka.c
new file mode 100644
index 0000000..974237a
--- /dev/null
+++ b/ext/simdcomp/bitpacka.c
@@ -0,0 +1,17773 @@
+#include "bitpacka.h"
+#define INLINE inline
+uint32_t * nullpacker(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ return out;
+}
+
+ const uint32_t * nullunpacker8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ memset(out,0,8 * 4);
+ return in;
+ }
+
+
+ uint32_t * __fastpackwithoutmask1_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in++) ;
+ *out |= ( (*in++) ) << 1 ;
+ *out |= ( (*in++) ) << 2 ;
+ *out |= ( (*in++) ) << 3 ;
+ *out |= ( (*in++) ) << 4 ;
+ *out |= ( (*in++) ) << 5 ;
+ *out |= ( (*in++) ) << 6 ;
+ *out |= ( (*in++) ) << 7 ;
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask2_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in++) ;
+ *out |= ( (*in++) ) << 2 ;
+ *out |= ( (*in++) ) << 4 ;
+ *out |= ( (*in++) ) << 6 ;
+ *out |= ( (*in++) ) << 8 ;
+ *out |= ( (*in++) ) << 10 ;
+ *out |= ( (*in++) ) << 12 ;
+ *out |= ( (*in++) ) << 14 ;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask3_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask4_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask5_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask6_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask7_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask8_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask9_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask10_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask11_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask12_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask13_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask14_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask15_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask16_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask17_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask18_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask19_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask20_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask21_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask22_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask23_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask24_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask25_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask26_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask27_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 24 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask28_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask29_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask30_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask31_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 30 );
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 29 );
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 24 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask32_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+#if 0
+#define OUTI(__x) *out++
+#define OUT(__x) *out
+#define OUI out++
+#else
+#define OUTI(__x) out[__x]
+#define OUT(__x) out[__x]
+#define OUI
+#endif
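+
+/* The macros above choose the output-addressing style for the scalar
+   unpackers that follow: the disabled #if 0 branch writes through a
+   bumped pointer, while the active branch indexes the output array
+   directly.  Note also that the generated code masks with x % (1U << k),
+   which compilers reduce to a bitwise AND with (1U << k) - 1. */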
+const INLINE uint32_t * __fastunpack1_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) & 1;
+ OUTI( 1) = ( (*in) >> 1 ) & 1;
+ OUTI( 2) = ( (*in) >> 2 ) & 1;
+ OUTI( 3) = ( (*in) >> 3 ) & 1;
+ OUTI( 4) = ( (*in) >> 4 ) & 1;
+ OUTI( 5) = ( (*in) >> 5 ) & 1;
+ OUTI( 6) = ( (*in) >> 6 ) & 1;
+ OUTI( 7) = ( (*in) >> 7 ) & 1;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack2_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 2 ) ;
+ OUTI( 1) = ( (*in) >> 2 ) % (1U << 2 ) ;
+ OUTI( 2) = ( (*in) >> 4 ) % (1U << 2 ) ;
+ OUTI( 3) = ( (*in) >> 6 ) % (1U << 2 ) ;
+ OUTI( 4) = ( (*in) >> 8 ) % (1U << 2 ) ;
+ OUTI( 5) = ( (*in) >> 10 ) % (1U << 2 ) ;
+ OUTI( 6) = ( (*in) >> 12 ) % (1U << 2 ) ;
+ OUTI( 7) = ( (*in) >> 14 ) % (1U << 2 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack3_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 3 ) ;
+ OUTI( 1) = ( (*in) >> 3 ) % (1U << 3 ) ;
+ OUTI( 2) = ( (*in) >> 6 ) % (1U << 3 ) ;
+ OUTI( 3) = ( (*in) >> 9 ) % (1U << 3 ) ;
+ OUTI( 4) = ( (*in) >> 12 ) % (1U << 3 ) ;
+ OUTI( 5) = ( (*in) >> 15 ) % (1U << 3 ) ;
+ OUTI( 6) = ( (*in) >> 18 ) % (1U << 3 ) ;
+ OUTI( 7) = ( (*in) >> 21 ) % (1U << 3 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack4_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 4 ) ;
+ OUTI( 1) = ( (*in) >> 4 ) % (1U << 4 ) ;
+ OUTI( 2) = ( (*in) >> 8 ) % (1U << 4 ) ;
+ OUTI( 3) = ( (*in) >> 12 ) % (1U << 4 ) ;
+ OUTI( 4) = ( (*in) >> 16 ) % (1U << 4 ) ;
+ OUTI( 5) = ( (*in) >> 20 ) % (1U << 4 ) ;
+ OUTI( 6) = ( (*in) >> 24 ) % (1U << 4 ) ;
+ OUTI( 7) = ( (*in++) >> 28 ) ;
+ return in;
+}
+
+const uint32_t * __fastunpack5_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 5 ) ;
+ OUTI( 1) = ( (*in) >> 5 ) % (1U << 5 ) ;
+ OUTI( 2) = ( (*in) >> 10 ) % (1U << 5 ) ;
+ OUTI( 3) = ( (*in) >> 15 ) % (1U << 5 ) ;
+ OUTI( 4) = ( (*in) >> 20 ) % (1U << 5 ) ;
+ OUTI( 5) = ( (*in) >> 25 ) % (1U << 5 ) ;
+ OUT( 6) = ( (*in++) >> 30 ) ;
+ OUT( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 );
+ OUI;
+ OUTI( 7) = ( (*in) >> 3 ) % (1U << 5 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack6_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 6 ) ;
+ OUTI( 1) = ( (*in) >> 6 ) % (1U << 6 ) ;
+ OUTI( 2) = ( (*in) >> 12 ) % (1U << 6 ) ;
+ OUTI( 3) = ( (*in) >> 18 ) % (1U << 6 ) ;
+ OUTI( 4) = ( (*in) >> 24 ) % (1U << 6 ) ;
+ OUT( 5) = ( (*in++) >> 30 ) ;
+ OUT( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ OUI;
+ OUTI( 6) = ( (*in) >> 4 ) % (1U << 6 ) ;
+ OUTI( 7) = ( (*in) >> 10 ) % (1U << 6 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack7_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 7 ) ;
+ OUTI( 1) = ( (*in) >> 7 ) % (1U << 7 ) ;
+ OUTI( 2) = ( (*in) >> 14 ) % (1U << 7 ) ;
+ OUTI( 3) = ( (*in) >> 21 ) % (1U << 7 ) ;
+ OUT( 4) = ( (*in++) >> 28 ) ;
+ OUT( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 );
+ OUI;
+ OUTI( 5) = ( (*in) >> 3 ) % (1U << 7 ) ;
+    OUTI( 6) = ( (*in) >> 10 ) % (1U << 7 ) ;
+    OUTI( 7) = ( (*in) >> 17 ) % (1U << 7 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack8_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ OUTI( 1) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ OUTI( 2) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ OUTI( 3) = ( (*in++) >> 24 ) ;
+ OUTI( 4) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ OUTI( 5) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ OUTI( 6) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ OUTI( 7) = ( (*in++) >> 24 ) ;
+ return in;
+}
+
+const INLINE uint32_t * __fastunpack9_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 9 ) ;
+ OUTI( 1) = ( (*in) >> 9 ) % (1U << 9 ) ;
+ OUTI( 2) = ( (*in) >> 18 ) % (1U << 9 ) ;
+ OUT( 3) = ( (*in++) >> 27 ) ;
+ OUT( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 );
+ OUI;
+ OUTI( 4) = ( (*in) >> 4 ) % (1U << 9 ) ;
+ OUTI( 5) = ( (*in) >> 13 ) % (1U << 9 ) ;
+ OUTI( 6) = ( (*in) >> 22 ) % (1U << 9 ) ;
+ OUT( 7) = ( (*in++) >> 31 ) ;
+ OUT( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 );
+ OUI;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack10_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 10 ) ;
+ OUTI( 1) = ( (*in) >> 10 ) % (1U << 10 ) ;
+ OUTI( 2) = ( (*in) >> 20 ) % (1U << 10 ) ;
+ OUT( 3) = ( (*in++) >> 30 ) ;
+ OUT( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ OUI;
+ OUTI( 4) = ( (*in) >> 8 ) % (1U << 10 ) ;
+ OUTI( 5) = ( (*in) >> 18 ) % (1U << 10 ) ;
+ OUT( 6) = ( (*in++) >> 28 ) ;
+ OUT( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ OUI;
+ OUTI( 7) = ( (*in) >> 6 ) % (1U << 10 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack11_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ((*in) >> 0 ) % (1U << 11 ) ;
+ OUTI( 1) = ((*in) >> 11 ) % (1U << 11 ) ;
+ OUT( 2) = ((*in++) >> 22 ) ;
+ OUT( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 );
+ OUI;
+ OUTI( 3) = ( (*in) >> 1 ) % (1U << 11 ) ;
+ OUTI( 4) = ((*in) >> 12 ) % (1U << 11 ) ;
+ OUT( 5) = (*in++) >> 23;
+ OUT( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 );
+ OUI;
+ OUTI( 6) = ((*in) >> 2 ) % (1U << 11 ) ;
+ OUTI( 7) = ((*in) >> 13 ) % (1U << 11 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack12_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 12 ) ;
+ OUTI( 1) = ( (*in) >> 12 ) % (1U << 12 ) ;
+ OUT( 2) = ( (*in++) >> 24 ) ;
+ OUT( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ OUI;
+ OUTI( 3) = ( (*in) >> 4 ) % (1U << 12 ) ;
+ OUTI( 4) = ( (*in) >> 16 ) % (1U << 12 ) ;
+ OUT( 5) = ( (*in++) >> 28 ) ;
+ OUT( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ OUI;
+ OUTI( 6) = ( (*in) >> 8 ) % (1U << 12 ) ;
+ OUTI( 7) = ( (*in++) >> 20 ) ;
+ return in;
+}
+
+const INLINE uint32_t * __fastunpack13_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ OUTI( 0) = ( (*in) >> 0 ) % (1U << 13 ) ;
+ OUTI( 1) = ( (*in) >> 13 ) % (1U << 13 ) ;
+ OUT( 2) = ( (*in++) >> 26 ) ;
+ OUT( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 );
+ OUI;
+ OUTI( 3) = ( (*in) >> 7 ) % (1U << 13 ) ;
+ OUT( 4) = ( (*in++) >> 20 ) ;
+ OUT( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 );
+ OUI;
+ OUTI( 5) = ( (*in) >> 1 ) % (1U << 13 ) ;
+ OUTI( 6) = ( (*in) >> 14 ) % (1U << 13 ) ;
+ OUT( 7) = ( (*in++) >> 27 );
+ OUT( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 );
+ OUI;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack14_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ *out++ = ( (*in) >> 0 ) % (1U << 14 ) ;
+ *out++ = ( (*in) >> 14 ) % (1U << 14 ) ;
+ *out = ( (*in++) >> 28 ) ;
+ *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ out++;
+ *out++ = ( (*in) >> 10 ) % (1U << 14 ) ;
+ *out = ( (*in++) >> 24 ) ;
+ *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ out++;
+ *out++ = ( (*in) >> 6 ) % (1U << 14 ) ;
+ *out = ( (*in++) >> 20 ) ;
+ *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ out++;
+ *out++ = ( (*in) >> 2 ) % (1U << 14 ) ;
+ return in + 1;
+}
+
+const INLINE uint32_t * __fastunpack15_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 15 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack16_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack17_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack18_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack19_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 19 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack20_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack21_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const INLINE uint32_t * __fastunpack22_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack23_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 23 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack24_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack25_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack26_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack27_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack28_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack29_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack30_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack31_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 31 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 );
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 );
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack32_8(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
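+ /* unpacks 8 integers stored at the given bit width; returns the position
+    of the next packed word in "in" */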
+ const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullunpacker8(in,out);
+
+ case 1:
+ return __fastunpack1_8(in,out);
+
+ case 2:
+ return __fastunpack2_8(in,out);
+
+ case 3:
+ return __fastunpack3_8(in,out);
+
+ case 4:
+ return __fastunpack4_8(in,out);
+
+ case 5:
+ return __fastunpack5_8(in,out);
+
+ case 6:
+ return __fastunpack6_8(in,out);
+
+ case 7:
+ return __fastunpack7_8(in,out);
+
+ case 8:
+ return __fastunpack8_8(in,out);
+
+ case 9:
+ return __fastunpack9_8(in,out);
+
+ case 10:
+ return __fastunpack10_8(in,out);
+
+ case 11:
+ return __fastunpack11_8(in,out);
+
+ case 12:
+ return __fastunpack12_8(in,out);
+
+ case 13:
+ return __fastunpack13_8(in,out);
+
+ case 14:
+ return __fastunpack14_8(in,out);
+
+ case 15:
+ return __fastunpack15_8(in,out);
+
+ case 16:
+ return __fastunpack16_8(in,out);
+
+ case 17:
+ return __fastunpack17_8(in,out);
+
+ case 18:
+ return __fastunpack18_8(in,out);
+
+ case 19:
+ return __fastunpack19_8(in,out);
+
+ case 20:
+ return __fastunpack20_8(in,out);
+
+ case 21:
+ return __fastunpack21_8(in,out);
+
+ case 22:
+ return __fastunpack22_8(in,out);
+
+ case 23:
+ return __fastunpack23_8(in,out);
+
+ case 24:
+ return __fastunpack24_8(in,out);
+
+ case 25:
+ return __fastunpack25_8(in,out);
+
+ case 26:
+ return __fastunpack26_8(in,out);
+
+ case 27:
+ return __fastunpack27_8(in,out);
+
+ case 28:
+ return __fastunpack28_8(in,out);
+
+ case 29:
+ return __fastunpack29_8(in,out);
+
+ case 30:
+ return __fastunpack30_8(in,out);
+
+ case 31:
+ return __fastunpack31_8(in,out);
+
+ case 32:
+ return __fastunpack32_8(in,out);
+
+    default:
+      break;
+  }
+  /* unsupported bit width: the original C++ code threw
+     logic_error("number of bits is unsupported") here; in C we return the
+     input pointer unchanged instead of falling off the end of the function */
+  return in;
+  }
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+ uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullpacker(in,out);
+
+ case 1:
+ return __fastpackwithoutmask1_8(in,out);
+
+ case 2:
+ return __fastpackwithoutmask2_8(in,out);
+
+ case 3:
+ return __fastpackwithoutmask3_8(in,out);
+
+ case 4:
+ return __fastpackwithoutmask4_8(in,out);
+
+ case 5:
+ return __fastpackwithoutmask5_8(in,out);
+
+ case 6:
+ return __fastpackwithoutmask6_8(in,out);
+
+ case 7:
+ return __fastpackwithoutmask7_8(in,out);
+
+ case 8:
+ return __fastpackwithoutmask8_8(in,out);
+
+ case 9:
+ return __fastpackwithoutmask9_8(in,out);
+
+ case 10:
+ return __fastpackwithoutmask10_8(in,out);
+
+ case 11:
+ return __fastpackwithoutmask11_8(in,out);
+
+ case 12:
+ return __fastpackwithoutmask12_8(in,out);
+
+ case 13:
+ return __fastpackwithoutmask13_8(in,out);
+
+ case 14:
+ return __fastpackwithoutmask14_8(in,out);
+
+ case 15:
+ return __fastpackwithoutmask15_8(in,out);
+
+ case 16:
+ return __fastpackwithoutmask16_8(in,out);
+
+ case 17:
+ return __fastpackwithoutmask17_8(in,out);
+
+ case 18:
+ return __fastpackwithoutmask18_8(in,out);
+
+ case 19:
+ return __fastpackwithoutmask19_8(in,out);
+
+ case 20:
+ return __fastpackwithoutmask20_8(in,out);
+
+ case 21:
+ return __fastpackwithoutmask21_8(in,out);
+
+ case 22:
+ return __fastpackwithoutmask22_8(in,out);
+
+ case 23:
+ return __fastpackwithoutmask23_8(in,out);
+
+ case 24:
+ return __fastpackwithoutmask24_8(in,out);
+
+ case 25:
+ return __fastpackwithoutmask25_8(in,out);
+
+ case 26:
+ return __fastpackwithoutmask26_8(in,out);
+
+ case 27:
+ return __fastpackwithoutmask27_8(in,out);
+
+ case 28:
+ return __fastpackwithoutmask28_8(in,out);
+
+ case 29:
+ return __fastpackwithoutmask29_8(in,out);
+
+ case 30:
+ return __fastpackwithoutmask30_8(in,out);
+
+ case 31:
+ return __fastpackwithoutmask31_8(in,out);
+
+ case 32:
+ return __fastpackwithoutmask32_8(in,out);
+
+    default:
+      break;
+  }
+  /* unsupported bit width: the original C++ code threw
+     logic_error("number of bits is unsupported") here; in C we return the
+     output pointer unchanged instead of falling off the end of the function */
+  return out;
+  }
+
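+/* Round-trip sketch (assumes, as the packer requires, that every input
+   value fits in `bit` bits):
+
+     uint32_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+     uint32_t packed[8], dst[8];
+     uint32_t bit = 4;                        // 8 values x 4 bits = 1 word
+     fastpackwithoutmask_8(src, packed, bit); // pack 8 integers
+     fastunpack_8(packed, dst, bit);          // recover them into dst
+*/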
+
+ const uint32_t * nullunpacker16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ memset(out,0,16 * 4);
+ return in;
+ }
+
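+ /* the *_16 variants below pack/unpack 16 integers per call, mirroring the
+    *_8 variants above */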
+
+ uint32_t * __fastpackwithoutmask1_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask2_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask3_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 3 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask4_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask5_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask6_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask7_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask8_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask9_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask10_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask11_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask12_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask13_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask14_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask15_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask16_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask17_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask18_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask19_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask20_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask21_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask22_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask23_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask24_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask25_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask26_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask27_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask28_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask29_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask30_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask31_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 30 );
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 29 );
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask32_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+
+ return out;
+ }
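+
+  /* At B = 32 packing is a verbatim 16-word copy: the scheme degrades to
+     memcpy-like behaviour when values need the full word. */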
+
+
+
+
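+/* Unpackers: __fastunpackB_16 reads ceil(16*B/32) words, writes 16 B-bit
+   values to "out", and returns the next unread input position. The idiom
+   "(x >> s) % (1U << B)" extracts the B-bit field starting at bit s; an
+   unsigned modulo by a power of two compiles to a bitwise AND with
+   (1U << B) - 1, so no division is executed. */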
+const uint32_t * __fastunpack1_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 1 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 2 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 3 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 4 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 5 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 6 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 7 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 8 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 9 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 10 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 11 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 12 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 13 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 14 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 15 ) & 1 ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack2_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack3_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 3 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack4_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack5_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 5 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack6_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack7_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 7 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack8_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack9_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 9 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack10_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack11_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 11 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack12_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack13_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 13 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack14_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack15_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 15 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack16_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack17_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack18_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack19_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack20_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack21_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack22_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack23_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack24_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack25_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack26_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack27_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack28_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack29_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack30_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack31_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 31 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 );
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 );
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack32_16(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
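+  /* Runtime dispatch: select the fixed-width unpacker for one block of 16
+     integers according to the bit width chosen at packing time. */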
+ const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullunpacker16(in,out);
+
+ case 1:
+ return __fastunpack1_16(in,out);
+
+ case 2:
+ return __fastunpack2_16(in,out);
+
+ case 3:
+ return __fastunpack3_16(in,out);
+
+ case 4:
+ return __fastunpack4_16(in,out);
+
+ case 5:
+ return __fastunpack5_16(in,out);
+
+ case 6:
+ return __fastunpack6_16(in,out);
+
+ case 7:
+ return __fastunpack7_16(in,out);
+
+ case 8:
+ return __fastunpack8_16(in,out);
+
+ case 9:
+ return __fastunpack9_16(in,out);
+
+ case 10:
+ return __fastunpack10_16(in,out);
+
+ case 11:
+ return __fastunpack11_16(in,out);
+
+ case 12:
+ return __fastunpack12_16(in,out);
+
+ case 13:
+ return __fastunpack13_16(in,out);
+
+ case 14:
+ return __fastunpack14_16(in,out);
+
+ case 15:
+ return __fastunpack15_16(in,out);
+
+ case 16:
+ return __fastunpack16_16(in,out);
+
+ case 17:
+ return __fastunpack17_16(in,out);
+
+ case 18:
+ return __fastunpack18_16(in,out);
+
+ case 19:
+ return __fastunpack19_16(in,out);
+
+ case 20:
+ return __fastunpack20_16(in,out);
+
+ case 21:
+ return __fastunpack21_16(in,out);
+
+ case 22:
+ return __fastunpack22_16(in,out);
+
+ case 23:
+ return __fastunpack23_16(in,out);
+
+ case 24:
+ return __fastunpack24_16(in,out);
+
+ case 25:
+ return __fastunpack25_16(in,out);
+
+ case 26:
+ return __fastunpack26_16(in,out);
+
+ case 27:
+ return __fastunpack27_16(in,out);
+
+ case 28:
+ return __fastunpack28_16(in,out);
+
+ case 29:
+ return __fastunpack29_16(in,out);
+
+ case 30:
+ return __fastunpack30_16(in,out);
+
+ case 31:
+ return __fastunpack31_16(in,out);
+
+ case 32:
+ return __fastunpack32_16(in,out);
+
+    default:
+      break;
+    }
+    //throw logic_error("number of bits is unsupported");
+    return in; /* avoid falling off the end of a non-void function */
+  }
+
+
+
+  /* Packs one block of 16 integers, "bit" bits each; assumes the values fit
+     in the prescribed number of bits (no masking is applied). */
+ uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullpacker(in,out);
+
+ case 1:
+ return __fastpackwithoutmask1_16(in,out);
+
+ case 2:
+ return __fastpackwithoutmask2_16(in,out);
+
+ case 3:
+ return __fastpackwithoutmask3_16(in,out);
+
+ case 4:
+ return __fastpackwithoutmask4_16(in,out);
+
+ case 5:
+ return __fastpackwithoutmask5_16(in,out);
+
+ case 6:
+ return __fastpackwithoutmask6_16(in,out);
+
+ case 7:
+ return __fastpackwithoutmask7_16(in,out);
+
+ case 8:
+ return __fastpackwithoutmask8_16(in,out);
+
+ case 9:
+ return __fastpackwithoutmask9_16(in,out);
+
+ case 10:
+ return __fastpackwithoutmask10_16(in,out);
+
+ case 11:
+ return __fastpackwithoutmask11_16(in,out);
+
+ case 12:
+ return __fastpackwithoutmask12_16(in,out);
+
+ case 13:
+ return __fastpackwithoutmask13_16(in,out);
+
+ case 14:
+ return __fastpackwithoutmask14_16(in,out);
+
+ case 15:
+ return __fastpackwithoutmask15_16(in,out);
+
+ case 16:
+ return __fastpackwithoutmask16_16(in,out);
+
+ case 17:
+ return __fastpackwithoutmask17_16(in,out);
+
+ case 18:
+ return __fastpackwithoutmask18_16(in,out);
+
+ case 19:
+ return __fastpackwithoutmask19_16(in,out);
+
+ case 20:
+ return __fastpackwithoutmask20_16(in,out);
+
+ case 21:
+ return __fastpackwithoutmask21_16(in,out);
+
+ case 22:
+ return __fastpackwithoutmask22_16(in,out);
+
+ case 23:
+ return __fastpackwithoutmask23_16(in,out);
+
+ case 24:
+ return __fastpackwithoutmask24_16(in,out);
+
+ case 25:
+ return __fastpackwithoutmask25_16(in,out);
+
+ case 26:
+ return __fastpackwithoutmask26_16(in,out);
+
+ case 27:
+ return __fastpackwithoutmask27_16(in,out);
+
+ case 28:
+ return __fastpackwithoutmask28_16(in,out);
+
+ case 29:
+ return __fastpackwithoutmask29_16(in,out);
+
+ case 30:
+ return __fastpackwithoutmask30_16(in,out);
+
+ case 31:
+ return __fastpackwithoutmask31_16(in,out);
+
+ case 32:
+ return __fastpackwithoutmask32_16(in,out);
+
+    default:
+      break;
+    }
+    //throw logic_error("number of bits is unsupported");
+    return out; /* avoid falling off the end of a non-void function */
+  }
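+
+  /* Usage sketch (illustrative only, not part of the generated kernels):
+     round-trips one block of 16 integers through the _16 pack/unpack pair.
+     The function name is hypothetical; it assumes <stdint.h> is included at
+     the top of this file and that every value fits in "bit" bits, as
+     fastpackwithoutmask_16 requires. */
+  static void bitpack16_roundtrip_example(void) {
+    uint32_t src[16] = { 3, 1, 4, 1, 5, 9, 2, 6, 5, 3, 5, 8, 9, 7, 9, 3 };
+    uint32_t packed[16];              /* worst case: 16 words when bit == 32 */
+    uint32_t dst[16];
+    const uint32_t bit = 4;           /* every value above fits in 4 bits */
+    uint32_t *pend       = fastpackwithoutmask_16(src, packed, bit);
+    const uint32_t *uend = fastunpack_16(packed, dst, bit);
+    /* 16 values * 4 bits = 64 bits = 2 words consumed on both sides:
+       pend - packed == 2 and uend - packed == 2; dst now equals src. */
+    (void)pend; (void)uend;
+  }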
+
+
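+  /* Bit width 0: every value in the block of 24 is zero, so just clear the
+     output (24 words of 4 bytes). */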
+ const uint32_t * nullunpacker24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ memset(out,0,24 * 4);
+ return in;
+ }
+
+
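+  /* The *_24 kernels below handle blocks of 24 integers, using the same
+     layout as the *_16 family above; the "withoutmask" packers assume each
+     value already fits in the target bit width. */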
+ uint32_t * __fastpackwithoutmask1_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask2_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask3_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 3 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 3 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask4_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask5_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask6_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask7_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask8_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask9_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask10_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask11_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask12_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask13_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask14_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask15_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask16_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask17_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask18_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask19_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask20_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask21_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask22_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask23_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask24_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask25_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 24 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask26_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask27_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask28_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask29_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 24 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask30_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask31_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 30 );
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 29 );
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 8 );
+ ++in;
+
+ return out + 1;
+ }
+
+
+
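+  /* 32-bit case: nothing to pack, copy all 24 words verbatim. */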
+ uint32_t * __fastpackwithoutmask32_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+
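+/* Unpackers for blocks of 24 integers. The masking idiom "% (1U << k)" is,
+   for unsigned operands, equivalent to "& ((1U << k) - 1)": it keeps only
+   the low k bits of the shifted word. */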
+const uint32_t * __fastunpack1_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 1 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 2 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 3 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 4 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 5 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 6 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 7 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 8 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 9 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 10 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 11 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 12 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 13 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 14 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 15 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 16 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 17 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 18 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 19 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 20 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 21 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 22 ) & 1 ;
+ out++;
+ *out = ( (*in) >> 23 ) & 1 ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack2_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 2 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 2 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack3_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 3 ) ;
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 3 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack4_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 4 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack5_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 5 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) % (1U << 5 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack6_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 6 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 6 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack7_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) % (1U << 7 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 7 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack8_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 8 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack9_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 9 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 9 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 9 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 9 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack10_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 10 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 10 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack11_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 11 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 11 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) % (1U << 11 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 11 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack12_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 12 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack13_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 13 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 13 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) % (1U << 13 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 13 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 13 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack14_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 14 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 14 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack15_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 15 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 15 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack16_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 16 ) ;
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack17_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 17 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 17 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack18_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack19_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack20_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack21_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 21 ) ;
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack22_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack23_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack24_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack25_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack26_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack27_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack28_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack29_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack30_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack31_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 31 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 );
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 );
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 );
+ out++;
+
+ return in + 1;
+ }
+
+
+
+
+const uint32_t * __fastunpack32_24(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
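+/* Dispatch on bit width. Widths that divide 32 (e.g. the 8-, 16- and 32-bit
+   routines above) never split a value across a word boundary, so they reduce
+   to byte-, halfword- and word-aligned extraction; __fastunpack32_24 in
+   particular is just a copy of 24 words. */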
+ const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullunpacker24(in,out);
+
+ case 1:
+ return __fastunpack1_24(in,out);
+
+ case 2:
+ return __fastunpack2_24(in,out);
+
+ case 3:
+ return __fastunpack3_24(in,out);
+
+ case 4:
+ return __fastunpack4_24(in,out);
+
+ case 5:
+ return __fastunpack5_24(in,out);
+
+ case 6:
+ return __fastunpack6_24(in,out);
+
+ case 7:
+ return __fastunpack7_24(in,out);
+
+ case 8:
+ return __fastunpack8_24(in,out);
+
+ case 9:
+ return __fastunpack9_24(in,out);
+
+ case 10:
+ return __fastunpack10_24(in,out);
+
+ case 11:
+ return __fastunpack11_24(in,out);
+
+ case 12:
+ return __fastunpack12_24(in,out);
+
+ case 13:
+ return __fastunpack13_24(in,out);
+
+ case 14:
+ return __fastunpack14_24(in,out);
+
+ case 15:
+ return __fastunpack15_24(in,out);
+
+ case 16:
+ return __fastunpack16_24(in,out);
+
+ case 17:
+ return __fastunpack17_24(in,out);
+
+ case 18:
+ return __fastunpack18_24(in,out);
+
+ case 19:
+ return __fastunpack19_24(in,out);
+
+ case 20:
+ return __fastunpack20_24(in,out);
+
+ case 21:
+ return __fastunpack21_24(in,out);
+
+ case 22:
+ return __fastunpack22_24(in,out);
+
+ case 23:
+ return __fastunpack23_24(in,out);
+
+ case 24:
+ return __fastunpack24_24(in,out);
+
+ case 25:
+ return __fastunpack25_24(in,out);
+
+ case 26:
+ return __fastunpack26_24(in,out);
+
+ case 27:
+ return __fastunpack27_24(in,out);
+
+ case 28:
+ return __fastunpack28_24(in,out);
+
+ case 29:
+ return __fastunpack29_24(in,out);
+
+ case 30:
+ return __fastunpack30_24(in,out);
+
+ case 31:
+ return __fastunpack31_24(in,out);
+
+ case 32:
+ return __fastunpack32_24(in,out);
+
+ default:
+ break;
+ }
+ /* unreachable for bit in [0,32]; C has no exceptions, so instead of the
+    original C++ "throw logic_error("number of bits is unsupported")"
+    return the input pointer unchanged */
+ return in;
+ }
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+ uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullpacker(in,out);
+
+ case 1:
+ return __fastpackwithoutmask1_24(in,out);
+
+ case 2:
+ return __fastpackwithoutmask2_24(in,out);
+
+ case 3:
+ return __fastpackwithoutmask3_24(in,out);
+
+ case 4:
+ return __fastpackwithoutmask4_24(in,out);
+
+ case 5:
+ return __fastpackwithoutmask5_24(in,out);
+
+ case 6:
+ return __fastpackwithoutmask6_24(in,out);
+
+ case 7:
+ return __fastpackwithoutmask7_24(in,out);
+
+ case 8:
+ return __fastpackwithoutmask8_24(in,out);
+
+ case 9:
+ return __fastpackwithoutmask9_24(in,out);
+
+ case 10:
+ return __fastpackwithoutmask10_24(in,out);
+
+ case 11:
+ return __fastpackwithoutmask11_24(in,out);
+
+ case 12:
+ return __fastpackwithoutmask12_24(in,out);
+
+ case 13:
+ return __fastpackwithoutmask13_24(in,out);
+
+ case 14:
+ return __fastpackwithoutmask14_24(in,out);
+
+ case 15:
+ return __fastpackwithoutmask15_24(in,out);
+
+ case 16:
+ return __fastpackwithoutmask16_24(in,out);
+
+ case 17:
+ return __fastpackwithoutmask17_24(in,out);
+
+ case 18:
+ return __fastpackwithoutmask18_24(in,out);
+
+ case 19:
+ return __fastpackwithoutmask19_24(in,out);
+
+ case 20:
+ return __fastpackwithoutmask20_24(in,out);
+
+ case 21:
+ return __fastpackwithoutmask21_24(in,out);
+
+ case 22:
+ return __fastpackwithoutmask22_24(in,out);
+
+ case 23:
+ return __fastpackwithoutmask23_24(in,out);
+
+ case 24:
+ return __fastpackwithoutmask24_24(in,out);
+
+ case 25:
+ return __fastpackwithoutmask25_24(in,out);
+
+ case 26:
+ return __fastpackwithoutmask26_24(in,out);
+
+ case 27:
+ return __fastpackwithoutmask27_24(in,out);
+
+ case 28:
+ return __fastpackwithoutmask28_24(in,out);
+
+ case 29:
+ return __fastpackwithoutmask29_24(in,out);
+
+ case 30:
+ return __fastpackwithoutmask30_24(in,out);
+
+ case 31:
+ return __fastpackwithoutmask31_24(in,out);
+
+ case 32:
+ return __fastpackwithoutmask32_24(in,out);
+
+ default:
+ break;
+ }
+ /* unreachable for bit in [0,32]; C has no exceptions, so instead of the
+    original C++ "throw logic_error("number of bits is unsupported")"
+    return the output pointer unchanged */
+ return out;
+ }
+
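+
+/* Illustrative only (not part of the original source): a minimal round-trip
+   sketch for the *_24 pair above. It assumes the 24 inputs already fit in
+   BITWIDTH bits, as fastpackwithoutmask_24 requires, and is disabled with
+   #if 0 so it serves as documentation rather than compiled code. */
+#if 0
+#include <assert.h>
+#include <stdint.h>
+static void roundtrip24_example(void) {
+    enum { BITWIDTH = 5 };                          /* any width in 0..32 */
+    uint32_t in[24], packed[24] = {0}, out[24];
+    int i;
+    for (i = 0; i < 24; i++)
+        in[i] = (uint32_t)i & ((1U << BITWIDTH) - 1); /* fits in BITWIDTH bits */
+    fastpackwithoutmask_24(in, packed, BITWIDTH);   /* writes ceil(24*BITWIDTH/32) words */
+    fastunpack_24(packed, out, BITWIDTH);           /* reads them back */
+    for (i = 0; i < 24; i++)
+        assert(in[i] == out[i]);
+}
+#endif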
+
+ const uint32_t * nullunpacker32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+ memset(out, 0, 32 * sizeof(uint32_t)); /* 32 zero values */
+ return in;
+ }
+
+
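+/* The __fastpackwithoutmaskB_32 routines below are the encoding mirror of
+   the unpackers: each stores 32 B-bit values with OR-and-shift and no
+   masking, so the caller must guarantee every input fits in B bits. For
+   widths that do not divide 32, a value split across a word boundary is
+   finished in the next output word by storing its high bits with
+   (*in) >> (B - r), where r is the number of bits that carried over. */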
+ uint32_t * __fastpackwithoutmask1_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask2_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask3_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 3 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 3 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask4_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask5_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 5 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask6_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 6 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask7_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 7 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
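+ /* 8-bit case: exactly four values per word, all on byte boundaries,
+    so no value ever straddles a word. */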
+ uint32_t * __fastpackwithoutmask8_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask9_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 9 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask10_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 10 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask11_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 11 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask12_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 12 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask13_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 13 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask14_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 14 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask15_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 15 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
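+ /* 16-bit case: two values per word, halfword aligned; no straddling. */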
+ uint32_t * __fastpackwithoutmask16_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
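+ /* For B > 16 a word can hold at most one complete value, so nearly
+    every word boundary splits a value across two words. */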
+ uint32_t * __fastpackwithoutmask17_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 17 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask18_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 18 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask19_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 19 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask20_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 20 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask21_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 21 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask22_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 22 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask23_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 23 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
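+ /* 24-bit case: each group of four inputs fills three words; the splits
+    always fall on byte boundaries (shifts of 8, 16 and 24). */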
+ uint32_t * __fastpackwithoutmask24_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 24 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask25_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 25 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask26_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 26 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask27_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 27 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask28_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 28 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask29_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 29 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask30_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 30 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
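+ /* 31-bit case: every value after the first straddles a word boundary,
+    the split point moving left by one bit with each value. */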
+ uint32_t * __fastpackwithoutmask31_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 30 );
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 29 );
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 28 );
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 27 );
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 26 );
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 25 );
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 24 );
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 23 );
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 22 );
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 21 );
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 20 );
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 19 );
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 18 );
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 17 );
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 16 );
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 15 );
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 14 );
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 13 );
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 12 );
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 11 );
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 10 );
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 9 );
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 8 );
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 7 );
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 6 );
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 5 );
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 4 );
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 3 );
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 2 );
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++out;
+ *out = ( (*in) ) >> ( 31 - 1 );
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
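+ /* 32-bit case degenerates to a straight copy of 32 words. */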
+ uint32_t * __fastpackwithoutmask32_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
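+ /* Output addressing for the unpack routines below: DST(i) names the
+    i-th output slot and DSI is a per-element hook (empty in both
+    variants). Only the indexed form is safe here: values that straddle
+    a word are written in two steps (DST(i) = ...; DST(i) |= ...), and
+    the pointer-bumping alternative (*out++) would advance twice. */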
+#if 1
+#define DST(__x) out[__x]
+#define DSI
+#else
+#define DST(__x) *out++
+#define DSI
+#endif
+
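+ /* Each __fastunpackB_32 routine expands B consecutive input words back
+    into 32 integers of B bits. Fields are isolated with a shift and a
+    mask, written as % (1U << B), which the compiler reduces to a bitwise
+    AND. The routines return the advanced input pointer. */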
+const uint32_t * __fastunpack1_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) & 1 ;
+ DSI;
+ DST( 1) = ( (*in) >> 1 ) & 1 ;
+ DSI;
+ DST( 2) = ( (*in) >> 2 ) & 1 ;
+ DSI;
+ DST( 3) = ( (*in) >> 3 ) & 1 ;
+ DSI;
+ DST( 4) = ( (*in) >> 4 ) & 1 ;
+ DSI;
+ DST( 5) = ( (*in) >> 5 ) & 1 ;
+ DSI;
+ DST( 6) = ( (*in) >> 6 ) & 1 ;
+ DSI;
+ DST( 7) = ( (*in) >> 7 ) & 1 ;
+ DSI;
+ DST( 8) = ( (*in) >> 8 ) & 1 ;
+ DSI;
+ DST( 9) = ( (*in) >> 9 ) & 1 ;
+ DSI;
+ DST(10) = ( (*in) >> 10 ) & 1 ;
+ DSI;
+ DST(11) = ( (*in) >> 11 ) & 1 ;
+ DSI;
+ DST(12) = ( (*in) >> 12 ) & 1 ;
+ DSI;
+ DST(13) = ( (*in) >> 13 ) & 1 ;
+ DSI;
+ DST(14) = ( (*in) >> 14 ) & 1 ;
+ DSI;
+ DST(15) = ( (*in) >> 15 ) & 1 ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) & 1 ;
+ DSI;
+ DST(17) = ( (*in) >> 17 ) & 1 ;
+ DSI;
+ DST(18) = ( (*in) >> 18 ) & 1 ;
+ DSI;
+ DST(19) = ( (*in) >> 19 ) & 1 ;
+ DSI;
+ DST(20) = ( (*in) >> 20 ) & 1 ;
+ DSI;
+ DST(21) = ( (*in) >> 21 ) & 1 ;
+ DSI;
+ DST(22) = ( (*in) >> 22 ) & 1 ;
+ DSI;
+ DST(23) = ( (*in) >> 23 ) & 1 ;
+ DSI;
+ DST(24) = ( (*in) >> 24 ) & 1 ;
+ DSI;
+ DST(25) = ( (*in) >> 25 ) & 1 ;
+ DSI;
+ DST(26) = ( (*in) >> 26 ) & 1 ;
+ DSI;
+ DST(27) = ( (*in) >> 27 ) & 1 ;
+ DSI;
+ DST(28) = ( (*in) >> 28 ) & 1 ;
+ DSI;
+ DST(29) = ( (*in) >> 29 ) & 1 ;
+ DSI;
+ DST(30) = ( (*in) >> 30 ) & 1 ;
+ DSI;
+ DST(31) = ( (*in) >> 31 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack2_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 2 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 2 ) % (1U << 2 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 4 ) % (1U << 2 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 6 ) % (1U << 2 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 8 ) % (1U << 2 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 10 ) % (1U << 2 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 12 ) % (1U << 2 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 14 ) % (1U << 2 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 16 ) % (1U << 2 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 18 ) % (1U << 2 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 20 ) % (1U << 2 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 22 ) % (1U << 2 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 24 ) % (1U << 2 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 26 ) % (1U << 2 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 28 ) % (1U << 2 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 30 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 2 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 2 ) % (1U << 2 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 4 ) % (1U << 2 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 6 ) % (1U << 2 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 8 ) % (1U << 2 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 10 ) % (1U << 2 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 12 ) % (1U << 2 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 14 ) % (1U << 2 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 16 ) % (1U << 2 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 18 ) % (1U << 2 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 20 ) % (1U << 2 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 22 ) % (1U << 2 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 24 ) % (1U << 2 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 26 ) % (1U << 2 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 28 ) % (1U << 2 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 30 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
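+ /* First width at which unpacked values straddle word boundaries: the
+    spilled high part is fetched from the next word and OR-ed back in
+    with DST(i) |= ((*in) % (1U << k)) << (B - k), where k is the number
+    of bits that spilled over. */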
+const uint32_t * __fastunpack3_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 3 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 3 ) % (1U << 3 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 6 ) % (1U << 3 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 9 ) % (1U << 3 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 12 ) % (1U << 3 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 15 ) % (1U << 3 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 18 ) % (1U << 3 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 21 ) % (1U << 3 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 24 ) % (1U << 3 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 27 ) % (1U << 3 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(10) |= ((*in) % (1U<< 1 ))<<( 3 - 1 );
+ DSI;
+ DST(11) = ( (*in) >> 1 ) % (1U << 3 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 4 ) % (1U << 3 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 7 ) % (1U << 3 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 10 ) % (1U << 3 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 13 ) % (1U << 3 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 3 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 19 ) % (1U << 3 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 22 ) % (1U << 3 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 25 ) % (1U << 3 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 28 ) % (1U << 3 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(21) |= ((*in) % (1U<< 2 ))<<( 3 - 2 );
+ DSI;
+ DST(22) = ( (*in) >> 2 ) % (1U << 3 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 5 ) % (1U << 3 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 8 ) % (1U << 3 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 11 ) % (1U << 3 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 14 ) % (1U << 3 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 17 ) % (1U << 3 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 20 ) % (1U << 3 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 23 ) % (1U << 3 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 26 ) % (1U << 3 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 29 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack4_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 4 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 4 ) % (1U << 4 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 8 ) % (1U << 4 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 12 ) % (1U << 4 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 16 ) % (1U << 4 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 20 ) % (1U << 4 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 24 ) % (1U << 4 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 28 ) ;
+ ++in;
+ DSI;
+ DST( 8) = ( (*in) >> 0 ) % (1U << 4 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 4 ) % (1U << 4 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 8 ) % (1U << 4 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 12 ) % (1U << 4 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 16 ) % (1U << 4 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 20 ) % (1U << 4 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 24 ) % (1U << 4 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 28 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 4 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 4 ) % (1U << 4 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 8 ) % (1U << 4 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 12 ) % (1U << 4 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 16 ) % (1U << 4 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 20 ) % (1U << 4 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 24 ) % (1U << 4 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 28 ) ;
+ ++in;
+ DSI;
+ DST(24) = ( (*in) >> 0 ) % (1U << 4 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 4 ) % (1U << 4 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 8 ) % (1U << 4 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 12 ) % (1U << 4 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 16 ) % (1U << 4 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 20 ) % (1U << 4 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 24 ) % (1U << 4 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 28 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
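+/* unpack 32 x 5-bit values from 5 input words */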
+const uint32_t * __fastunpack5_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 5 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 5 ) % (1U << 5 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 10 ) % (1U << 5 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 15 ) % (1U << 5 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 20 ) % (1U << 5 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 25 ) % (1U << 5 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 30 ) ;
+ ++in;
+ DST( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 );
+ DSI;
+ DST( 7) = ( (*in) >> 3 ) % (1U << 5 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 8 ) % (1U << 5 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 13 ) % (1U << 5 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 18 ) % (1U << 5 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 23 ) % (1U << 5 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(12) |= ((*in) % (1U<< 1 ))<<( 5 - 1 );
+ DSI;
+ DST(13) = ( (*in) >> 1 ) % (1U << 5 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 6 ) % (1U << 5 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 11 ) % (1U << 5 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 5 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 21 ) % (1U << 5 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 26 ) % (1U << 5 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(19) |= ((*in) % (1U<< 4 ))<<( 5 - 4 );
+ DSI;
+ DST(20) = ( (*in) >> 4 ) % (1U << 5 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 9 ) % (1U << 5 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 14 ) % (1U << 5 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 19 ) % (1U << 5 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 24 ) % (1U << 5 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(25) |= ((*in) % (1U<< 2 ))<<( 5 - 2 );
+ DSI;
+ DST(26) = ( (*in) >> 2 ) % (1U << 5 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 7 ) % (1U << 5 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 12 ) % (1U << 5 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 17 ) % (1U << 5 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 22 ) % (1U << 5 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 27 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
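+/* unpack 32 x 6-bit values from 6 input words; the layout repeats every 16 values (3 words) */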
+const uint32_t * __fastunpack6_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 6 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 6 ) % (1U << 6 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 12 ) % (1U << 6 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 18 ) % (1U << 6 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 24 ) % (1U << 6 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 30 ) ;
+ ++in;
+ DST( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ DSI;
+ DST( 6) = ( (*in) >> 4 ) % (1U << 6 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 10 ) % (1U << 6 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 16 ) % (1U << 6 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 22 ) % (1U << 6 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(10) |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+ DSI;
+ DST(11) = ( (*in) >> 2 ) % (1U << 6 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 8 ) % (1U << 6 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 14 ) % (1U << 6 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 20 ) % (1U << 6 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 26 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 6 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 6 ) % (1U << 6 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 12 ) % (1U << 6 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 18 ) % (1U << 6 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 24 ) % (1U << 6 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(21) |= ((*in) % (1U<< 4 ))<<( 6 - 4 );
+ DSI;
+ DST(22) = ( (*in) >> 4 ) % (1U << 6 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 10 ) % (1U << 6 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 16 ) % (1U << 6 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 22 ) % (1U << 6 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(26) |= ((*in) % (1U<< 2 ))<<( 6 - 2 );
+ DSI;
+ DST(27) = ( (*in) >> 2 ) % (1U << 6 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 8 ) % (1U << 6 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 14 ) % (1U << 6 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 20 ) % (1U << 6 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 26 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
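+/* unpack 32 x 7-bit values from 7 input words */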
+const uint32_t * __fastunpack7_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 7 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 7 ) % (1U << 7 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 14 ) % (1U << 7 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 21 ) % (1U << 7 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 28 ) ;
+ ++in;
+ DST( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 );
+ DSI;
+ DST( 5) = ( (*in) >> 3 ) % (1U << 7 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 10 ) % (1U << 7 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 17 ) % (1U << 7 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 24 ) % (1U << 7 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 31 ) ;
+ ++in;
+ DST( 9) |= ((*in) % (1U<< 6 ))<<( 7 - 6 );
+ DSI;
+ DST(10) = ( (*in) >> 6 ) % (1U << 7 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 13 ) % (1U << 7 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 20 ) % (1U << 7 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 27 ) ;
+ ++in;
+ DST(13) |= ((*in) % (1U<< 2 ))<<( 7 - 2 );
+ DSI;
+ DST(14) = ( (*in) >> 2 ) % (1U << 7 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 9 ) % (1U << 7 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 7 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 23 ) % (1U << 7 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(18) |= ((*in) % (1U<< 5 ))<<( 7 - 5 );
+ DSI;
+ DST(19) = ( (*in) >> 5 ) % (1U << 7 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 12 ) % (1U << 7 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 19 ) % (1U << 7 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(22) |= ((*in) % (1U<< 1 ))<<( 7 - 1 );
+ DSI;
+ DST(23) = ( (*in) >> 1 ) % (1U << 7 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 8 ) % (1U << 7 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 15 ) % (1U << 7 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 22 ) % (1U << 7 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(27) |= ((*in) % (1U<< 4 ))<<( 7 - 4 );
+ DSI;
+ DST(28) = ( (*in) >> 4 ) % (1U << 7 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 11 ) % (1U << 7 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 18 ) % (1U << 7 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 25 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
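+/* unpack 32 x 8-bit values from 8 input words; every field is byte-aligned, so none crosses a word boundary */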
+const uint32_t * __fastunpack8_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST( 4) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST( 8) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST(12) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST(20) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST(24) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+ DST(28) = ( (*in) >> 0 ) % (1U << 8 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 8 ) % (1U << 8 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 16 ) % (1U << 8 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 24 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
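+/* unpack 32 x 9-bit values from 9 input words */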
+const uint32_t * __fastunpack9_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 9 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 9 ) % (1U << 9 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 18 ) % (1U << 9 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 27 ) ;
+ ++in;
+ DST( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 );
+ DSI;
+ DST( 4) = ( (*in) >> 4 ) % (1U << 9 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 13 ) % (1U << 9 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 22 ) % (1U << 9 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 31 ) ;
+ ++in;
+ DST( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 );
+ DSI;
+ DST( 8) = ( (*in) >> 8 ) % (1U << 9 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 17 ) % (1U << 9 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(10) |= ((*in) % (1U<< 3 ))<<( 9 - 3 );
+ DSI;
+ DST(11) = ( (*in) >> 3 ) % (1U << 9 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 12 ) % (1U << 9 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 21 ) % (1U << 9 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(14) |= ((*in) % (1U<< 7 ))<<( 9 - 7 );
+ DSI;
+ DST(15) = ( (*in) >> 7 ) % (1U << 9 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 9 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 25 ) ;
+ ++in;
+ DST(17) |= ((*in) % (1U<< 2 ))<<( 9 - 2 );
+ DSI;
+ DST(18) = ( (*in) >> 2 ) % (1U << 9 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 11 ) % (1U << 9 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 20 ) % (1U << 9 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(21) |= ((*in) % (1U<< 6 ))<<( 9 - 6 );
+ DSI;
+ DST(22) = ( (*in) >> 6 ) % (1U << 9 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 15 ) % (1U << 9 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(24) |= ((*in) % (1U<< 1 ))<<( 9 - 1 );
+ DSI;
+ DST(25) = ( (*in) >> 1 ) % (1U << 9 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 10 ) % (1U << 9 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 19 ) % (1U << 9 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(28) |= ((*in) % (1U<< 5 ))<<( 9 - 5 );
+ DSI;
+ DST(29) = ( (*in) >> 5 ) % (1U << 9 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 14 ) % (1U << 9 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 23 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
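+/* unpack 32 x 10-bit values from 10 input words; the layout repeats every 16 values (5 words) */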
+const uint32_t * __fastunpack10_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 10 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 10 ) % (1U << 10 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 20 ) % (1U << 10 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 30 ) ;
+ ++in;
+ DST( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ DSI;
+ DST( 4) = ( (*in) >> 8 ) % (1U << 10 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 18 ) % (1U << 10 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 28 ) ;
+ ++in;
+ DST( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ DSI;
+ DST( 7) = ( (*in) >> 6 ) % (1U << 10 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 16 ) % (1U << 10 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 26 ) ;
+ ++in;
+ DST( 9) |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+ DSI;
+ DST(10) = ( (*in) >> 4 ) % (1U << 10 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 14 ) % (1U << 10 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(12) |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+ DSI;
+ DST(13) = ( (*in) >> 2 ) % (1U << 10 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 12 ) % (1U << 10 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 22 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 10 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 10 ) % (1U << 10 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 20 ) % (1U << 10 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(19) |= ((*in) % (1U<< 8 ))<<( 10 - 8 );
+ DSI;
+ DST(20) = ( (*in) >> 8 ) % (1U << 10 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 18 ) % (1U << 10 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(22) |= ((*in) % (1U<< 6 ))<<( 10 - 6 );
+ DSI;
+ DST(23) = ( (*in) >> 6 ) % (1U << 10 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 16 ) % (1U << 10 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(25) |= ((*in) % (1U<< 4 ))<<( 10 - 4 );
+ DSI;
+ DST(26) = ( (*in) >> 4 ) % (1U << 10 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 14 ) % (1U << 10 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(28) |= ((*in) % (1U<< 2 ))<<( 10 - 2 );
+ DSI;
+ DST(29) = ( (*in) >> 2 ) % (1U << 10 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 12 ) % (1U << 10 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 22 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
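+/* unpack 32 x 11-bit values from 11 input words */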
+const uint32_t * __fastunpack11_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 11 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 11 ) % (1U << 11 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 22 ) ;
+ ++in;
+ DST( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 );
+ DSI;
+ DST( 3) = ( (*in) >> 1 ) % (1U << 11 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 12 ) % (1U << 11 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 23 ) ;
+ ++in;
+ DST( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 );
+ DSI;
+ DST( 6) = ( (*in) >> 2 ) % (1U << 11 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 13 ) % (1U << 11 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 24 ) ;
+ ++in;
+ DST( 8) |= ((*in) % (1U<< 3 ))<<( 11 - 3 );
+ DSI;
+ DST( 9) = ( (*in) >> 3 ) % (1U << 11 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 14 ) % (1U << 11 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 25 ) ;
+ ++in;
+ DST(11) |= ((*in) % (1U<< 4 ))<<( 11 - 4 );
+ DSI;
+ DST(12) = ( (*in) >> 4 ) % (1U << 11 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 15 ) % (1U << 11 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(14) |= ((*in) % (1U<< 5 ))<<( 11 - 5 );
+ DSI;
+ DST(15) = ( (*in) >> 5 ) % (1U << 11 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 11 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 27 ) ;
+ ++in;
+ DST(17) |= ((*in) % (1U<< 6 ))<<( 11 - 6 );
+ DSI;
+ DST(18) = ( (*in) >> 6 ) % (1U << 11 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 17 ) % (1U << 11 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(20) |= ((*in) % (1U<< 7 ))<<( 11 - 7 );
+ DSI;
+ DST(21) = ( (*in) >> 7 ) % (1U << 11 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 18 ) % (1U << 11 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(23) |= ((*in) % (1U<< 8 ))<<( 11 - 8 );
+ DSI;
+ DST(24) = ( (*in) >> 8 ) % (1U << 11 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 19 ) % (1U << 11 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(26) |= ((*in) % (1U<< 9 ))<<( 11 - 9 );
+ DSI;
+ DST(27) = ( (*in) >> 9 ) % (1U << 11 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 20 ) % (1U << 11 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(29) |= ((*in) % (1U<< 10 ))<<( 11 - 10 );
+ DSI;
+ DST(30) = ( (*in) >> 10 ) % (1U << 11 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 21 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
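+/* unpack 32 x 12-bit values from 12 input words; the layout repeats every 8 values (3 words) */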
+const uint32_t * __fastunpack12_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 12 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 12 ) % (1U << 12 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 24 ) ;
+ ++in;
+ DST( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ DSI;
+ DST( 3) = ( (*in) >> 4 ) % (1U << 12 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 16 ) % (1U << 12 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 28 ) ;
+ ++in;
+ DST( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ DSI;
+ DST( 6) = ( (*in) >> 8 ) % (1U << 12 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 20 ) ;
+ ++in;
+ DSI;
+ DST( 8) = ( (*in) >> 0 ) % (1U << 12 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 12 ) % (1U << 12 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(10) |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ DSI;
+ DST(11) = ( (*in) >> 4 ) % (1U << 12 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 16 ) % (1U << 12 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(13) |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ DSI;
+ DST(14) = ( (*in) >> 8 ) % (1U << 12 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 20 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 12 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 12 ) % (1U << 12 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(18) |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ DSI;
+ DST(19) = ( (*in) >> 4 ) % (1U << 12 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 16 ) % (1U << 12 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(21) |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ DSI;
+ DST(22) = ( (*in) >> 8 ) % (1U << 12 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 20 ) ;
+ ++in;
+ DSI;
+ DST(24) = ( (*in) >> 0 ) % (1U << 12 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 12 ) % (1U << 12 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(26) |= ((*in) % (1U<< 4 ))<<( 12 - 4 );
+ DSI;
+ DST(27) = ( (*in) >> 4 ) % (1U << 12 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 16 ) % (1U << 12 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(29) |= ((*in) % (1U<< 8 ))<<( 12 - 8 );
+ DSI;
+ DST(30) = ( (*in) >> 8 ) % (1U << 12 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 20 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
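+/* unpack 32 x 13-bit values from 13 input words */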
+const uint32_t * __fastunpack13_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 13 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 13 ) % (1U << 13 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 26 ) ;
+ ++in;
+ DST( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 );
+ DSI;
+ DST( 3) = ( (*in) >> 7 ) % (1U << 13 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 20 ) ;
+ ++in;
+ DST( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 );
+ DSI;
+ DST( 5) = ( (*in) >> 1 ) % (1U << 13 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 14 ) % (1U << 13 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 27 ) ;
+ ++in;
+ DST( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 );
+ DSI;
+ DST( 8) = ( (*in) >> 8 ) % (1U << 13 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 21 ) ;
+ ++in;
+ DST( 9) |= ((*in) % (1U<< 2 ))<<( 13 - 2 );
+ DSI;
+ DST(10) = ( (*in) >> 2 ) % (1U << 13 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 15 ) % (1U << 13 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(12) |= ((*in) % (1U<< 9 ))<<( 13 - 9 );
+ DSI;
+ DST(13) = ( (*in) >> 9 ) % (1U << 13 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 22 ) ;
+ ++in;
+ DST(14) |= ((*in) % (1U<< 3 ))<<( 13 - 3 );
+ DSI;
+ DST(15) = ( (*in) >> 3 ) % (1U << 13 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 13 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(17) |= ((*in) % (1U<< 10 ))<<( 13 - 10 );
+ DSI;
+ DST(18) = ( (*in) >> 10 ) % (1U << 13 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 23 ) ;
+ ++in;
+ DST(19) |= ((*in) % (1U<< 4 ))<<( 13 - 4 );
+ DSI;
+ DST(20) = ( (*in) >> 4 ) % (1U << 13 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 17 ) % (1U << 13 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(22) |= ((*in) % (1U<< 11 ))<<( 13 - 11 );
+ DSI;
+ DST(23) = ( (*in) >> 11 ) % (1U << 13 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(24) |= ((*in) % (1U<< 5 ))<<( 13 - 5 );
+ DSI;
+ DST(25) = ( (*in) >> 5 ) % (1U << 13 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 18 ) % (1U << 13 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(27) |= ((*in) % (1U<< 12 ))<<( 13 - 12 );
+ DSI;
+ DST(28) = ( (*in) >> 12 ) % (1U << 13 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 25 ) ;
+ ++in;
+ DST(29) |= ((*in) % (1U<< 6 ))<<( 13 - 6 );
+ DSI;
+ DST(30) = ( (*in) >> 6 ) % (1U << 13 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 19 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
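+/* unpack 32 x 14-bit values from 14 input words; the layout repeats every 16 values (7 words) */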
+const uint32_t * __fastunpack14_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 14 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 14 ) % (1U << 14 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 28 ) ;
+ ++in;
+ DST( 2) |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ DSI;
+ DST( 3) = ( (*in) >> 10 ) % (1U << 14 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 24 ) ;
+ ++in;
+ DST( 4) |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ DSI;
+ DST( 5) = ( (*in) >> 6 ) % (1U << 14 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 20 ) ;
+ ++in;
+ DST( 6) |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ DSI;
+ DST( 7) = ( (*in) >> 2 ) % (1U << 14 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 16 ) % (1U << 14 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 30 ) ;
+ ++in;
+ DST( 9) |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+ DSI;
+ DST(10) = ( (*in) >> 12 ) % (1U << 14 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(11) |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+ DSI;
+ DST(12) = ( (*in) >> 8 ) % (1U << 14 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 22 ) ;
+ ++in;
+ DST(13) |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+ DSI;
+ DST(14) = ( (*in) >> 4 ) % (1U << 14 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 18 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 14 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 14 ) % (1U << 14 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(18) |= ((*in) % (1U<< 10 ))<<( 14 - 10 );
+ DSI;
+ DST(19) = ( (*in) >> 10 ) % (1U << 14 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(20) |= ((*in) % (1U<< 6 ))<<( 14 - 6 );
+ DSI;
+ DST(21) = ( (*in) >> 6 ) % (1U << 14 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 20 ) ;
+ ++in;
+ DST(22) |= ((*in) % (1U<< 2 ))<<( 14 - 2 );
+ DSI;
+ DST(23) = ( (*in) >> 2 ) % (1U << 14 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 16 ) % (1U << 14 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(25) |= ((*in) % (1U<< 12 ))<<( 14 - 12 );
+ DSI;
+ DST(26) = ( (*in) >> 12 ) % (1U << 14 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(27) |= ((*in) % (1U<< 8 ))<<( 14 - 8 );
+ DSI;
+ DST(28) = ( (*in) >> 8 ) % (1U << 14 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 22 ) ;
+ ++in;
+ DST(29) |= ((*in) % (1U<< 4 ))<<( 14 - 4 );
+ DSI;
+ DST(30) = ( (*in) >> 4 ) % (1U << 14 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 18 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
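+/* unpack 32 x 15-bit values from 15 input words */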
+const uint32_t * __fastunpack15_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 15 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 15 ) % (1U << 15 ) ;
+ DSI;
+ DST( 2) = ( (*in) >> 30 ) ;
+ ++in;
+ DST( 2) |= ((*in) % (1U<< 13 ))<<( 15 - 13 );
+ DSI;
+ DST( 3) = ( (*in) >> 13 ) % (1U << 15 ) ;
+ DSI;
+ DST( 4) = ( (*in) >> 28 ) ;
+ ++in;
+ DST( 4) |= ((*in) % (1U<< 11 ))<<( 15 - 11 );
+ DSI;
+ DST( 5) = ( (*in) >> 11 ) % (1U << 15 ) ;
+ DSI;
+ DST( 6) = ( (*in) >> 26 ) ;
+ ++in;
+ DST( 6) |= ((*in) % (1U<< 9 ))<<( 15 - 9 );
+ DSI;
+ DST( 7) = ( (*in) >> 9 ) % (1U << 15 ) ;
+ DSI;
+ DST( 8) = ( (*in) >> 24 ) ;
+ ++in;
+ DST( 8) |= ((*in) % (1U<< 7 ))<<( 15 - 7 );
+ DSI;
+ DST( 9) = ( (*in) >> 7 ) % (1U << 15 ) ;
+ DSI;
+ DST(10) = ( (*in) >> 22 ) ;
+ ++in;
+ DST(10) |= ((*in) % (1U<< 5 ))<<( 15 - 5 );
+ DSI;
+ DST(11) = ( (*in) >> 5 ) % (1U << 15 ) ;
+ DSI;
+ DST(12) = ( (*in) >> 20 ) ;
+ ++in;
+ DST(12) |= ((*in) % (1U<< 3 ))<<( 15 - 3 );
+ DSI;
+ DST(13) = ( (*in) >> 3 ) % (1U << 15 ) ;
+ DSI;
+ DST(14) = ( (*in) >> 18 ) ;
+ ++in;
+ DST(14) |= ((*in) % (1U<< 1 ))<<( 15 - 1 );
+ DSI;
+ DST(15) = ( (*in) >> 1 ) % (1U << 15 ) ;
+ DSI;
+ DST(16) = ( (*in) >> 16 ) % (1U << 15 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(17) |= ((*in) % (1U<< 14 ))<<( 15 - 14 );
+ DSI;
+ DST(18) = ( (*in) >> 14 ) % (1U << 15 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(19) |= ((*in) % (1U<< 12 ))<<( 15 - 12 );
+ DSI;
+ DST(20) = ( (*in) >> 12 ) % (1U << 15 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 27 ) ;
+ ++in;
+ DST(21) |= ((*in) % (1U<< 10 ))<<( 15 - 10 );
+ DSI;
+ DST(22) = ( (*in) >> 10 ) % (1U << 15 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 25 ) ;
+ ++in;
+ DST(23) |= ((*in) % (1U<< 8 ))<<( 15 - 8 );
+ DSI;
+ DST(24) = ( (*in) >> 8 ) % (1U << 15 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 23 ) ;
+ ++in;
+ DST(25) |= ((*in) % (1U<< 6 ))<<( 15 - 6 );
+ DSI;
+ DST(26) = ( (*in) >> 6 ) % (1U << 15 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 21 ) ;
+ ++in;
+ DST(27) |= ((*in) % (1U<< 4 ))<<( 15 - 4 );
+ DSI;
+ DST(28) = ( (*in) >> 4 ) % (1U << 15 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 19 ) ;
+ ++in;
+ DST(29) |= ((*in) % (1U<< 2 ))<<( 15 - 2 );
+ DSI;
+ DST(30) = ( (*in) >> 2 ) % (1U << 15 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 17 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
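+/* unpack 32 x 16-bit values from 16 input words; two halfword-aligned fields per word */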
+const uint32_t * __fastunpack16_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST( 2) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST( 4) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST( 6) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST( 8) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(10) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(12) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(14) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(16) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(17) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(18) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(19) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(20) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(21) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(22) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(23) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(24) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(25) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(26) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(27) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(28) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(29) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+ DST(30) = ( (*in) >> 0 ) % (1U << 16 ) ;
+ DSI;
+ DST(31) = ( (*in) >> 16 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
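+/* unpack 32 x 17-bit values from 17 input words */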
+const uint32_t * __fastunpack17_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ DST( 0) = ( (*in) >> 0 ) % (1U << 17 ) ;
+ DSI;
+ DST( 1) = ( (*in) >> 17 ) ;
+ ++in;
+ DST( 1) |= ((*in) % (1U<< 2 ))<<( 17 - 2 );
+ DSI;
+ DST( 2) = ( (*in) >> 2 ) % (1U << 17 ) ;
+ DSI;
+ DST( 3) = ( (*in) >> 19 ) ;
+ ++in;
+ DST( 3) |= ((*in) % (1U<< 4 ))<<( 17 - 4 );
+ DSI;
+ DST( 4) = ( (*in) >> 4 ) % (1U << 17 ) ;
+ DSI;
+ DST( 5) = ( (*in) >> 21 ) ;
+ ++in;
+ DST( 5) |= ((*in) % (1U<< 6 ))<<( 17 - 6 );
+ DSI;
+ DST( 6) = ( (*in) >> 6 ) % (1U << 17 ) ;
+ DSI;
+ DST( 7) = ( (*in) >> 23 ) ;
+ ++in;
+ DST( 7) |= ((*in) % (1U<< 8 ))<<( 17 - 8 );
+ DSI;
+ DST( 8) = ( (*in) >> 8 ) % (1U << 17 ) ;
+ DSI;
+ DST( 9) = ( (*in) >> 25 ) ;
+ ++in;
+ DST( 9) |= ((*in) % (1U<< 10 ))<<( 17 - 10 );
+ DSI;
+ DST(10) = ( (*in) >> 10 ) % (1U << 17 ) ;
+ DSI;
+ DST(11) = ( (*in) >> 27 ) ;
+ ++in;
+ DST(11) |= ((*in) % (1U<< 12 ))<<( 17 - 12 );
+ DSI;
+ DST(12) = ( (*in) >> 12 ) % (1U << 17 ) ;
+ DSI;
+ DST(13) = ( (*in) >> 29 ) ;
+ ++in;
+ DST(13) |= ((*in) % (1U<< 14 ))<<( 17 - 14 );
+ DSI;
+ DST(14) = ( (*in) >> 14 ) % (1U << 17 ) ;
+ DSI;
+ DST(15) = ( (*in) >> 31 ) ;
+ ++in;
+ DST(15) |= ((*in) % (1U<< 16 ))<<( 17 - 16 );
+ DSI;
+ DST(16) = ( (*in) >> 16 ) ;
+ ++in;
+ DST(16) |= ((*in) % (1U<< 1 ))<<( 17 - 1 );
+ DSI;
+ DST(17) = ( (*in) >> 1 ) % (1U << 17 ) ;
+ DSI;
+ DST(18) = ( (*in) >> 18 ) ;
+ ++in;
+ DST(18) |= ((*in) % (1U<< 3 ))<<( 17 - 3 );
+ DSI;
+ DST(19) = ( (*in) >> 3 ) % (1U << 17 ) ;
+ DSI;
+ DST(20) = ( (*in) >> 20 ) ;
+ ++in;
+ DST(20) |= ((*in) % (1U<< 5 ))<<( 17 - 5 );
+ DSI;
+ DST(21) = ( (*in) >> 5 ) % (1U << 17 ) ;
+ DSI;
+ DST(22) = ( (*in) >> 22 ) ;
+ ++in;
+ DST(22) |= ((*in) % (1U<< 7 ))<<( 17 - 7 );
+ DSI;
+ DST(23) = ( (*in) >> 7 ) % (1U << 17 ) ;
+ DSI;
+ DST(24) = ( (*in) >> 24 ) ;
+ ++in;
+ DST(24) |= ((*in) % (1U<< 9 ))<<( 17 - 9 );
+ DSI;
+ DST(25) = ( (*in) >> 9 ) % (1U << 17 ) ;
+ DSI;
+ DST(26) = ( (*in) >> 26 ) ;
+ ++in;
+ DST(26) |= ((*in) % (1U<< 11 ))<<( 17 - 11 );
+ DSI;
+ DST(27) = ( (*in) >> 11 ) % (1U << 17 ) ;
+ DSI;
+ DST(28) = ( (*in) >> 28 ) ;
+ ++in;
+ DST(28) |= ((*in) % (1U<< 13 ))<<( 17 - 13 );
+ DSI;
+ DST(29) = ( (*in) >> 13 ) % (1U << 17 ) ;
+ DSI;
+ DST(30) = ( (*in) >> 30 ) ;
+ ++in;
+ DST(30) |= ((*in) % (1U<< 15 ))<<( 17 - 15 );
+ DSI;
+ DST(31) = ( (*in) >> 15 ) ;
+ ++in;
+ DSI;
+
+ return in;
+ }
+
+
+
+
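+/* unpack 32 x 18-bit values from 18 input words; from this width on the
+   generated code writes through "out" directly instead of the DST()/DSI macros */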
+const uint32_t * __fastunpack18_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 18 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
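+/* unpack 32 x 19-bit values from 19 input words */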
+const uint32_t * __fastunpack19_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 19 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 19 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
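+/* unpack 32 x 20-bit values from 20 input words; the layout repeats every 8 values (5 words) */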
+const uint32_t * __fastunpack20_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 20 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
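+/* unpack 32 x 21-bit values from 21 input words */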
+const uint32_t * __fastunpack21_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 21 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
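+/* unpack 32 x 22-bit values from 22 input words; the layout repeats every 16 values (11 words) */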
+const uint32_t * __fastunpack22_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 22 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
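+/* unpack 32 x 23-bit values from 23 input words */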
+const uint32_t * __fastunpack23_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 23 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
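+/* unpack 32 x 24-bit values from 24 input words; the layout repeats every 4 values (3 words) */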
+const uint32_t * __fastunpack24_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 24 ) ;
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
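+/* unpack 32 x 25-bit values from 25 input words */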
+const uint32_t * __fastunpack25_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 25 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
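+/* unpack 32 x 26-bit values from 26 input words; the layout repeats every 16 values (13 words) */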
+const uint32_t * __fastunpack26_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 26 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
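+/* unpack 32 x 27-bit values from 27 input words */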
+const uint32_t * __fastunpack27_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) % (1U << 27 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
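+/* unpack 32 x 28-bit values from 28 input words; the layout repeats every 8 values (7 words) */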
+const uint32_t * __fastunpack28_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 28 ) ;
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
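+/* unpack 32 x 29-bit values from 29 input words */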
+const uint32_t * __fastunpack29_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) % (1U << 29 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack30_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) % (1U << 30 ) ;
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack31_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) % (1U << 31 ) ;
+ out++;
+ *out = ( (*in) >> 31 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 );
+ out++;
+ *out = ( (*in) >> 30 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 );
+ out++;
+ *out = ( (*in) >> 29 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 );
+ out++;
+ *out = ( (*in) >> 28 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 );
+ out++;
+ *out = ( (*in) >> 27 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 );
+ out++;
+ *out = ( (*in) >> 26 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 );
+ out++;
+ *out = ( (*in) >> 25 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 );
+ out++;
+ *out = ( (*in) >> 24 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 );
+ out++;
+ *out = ( (*in) >> 23 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 );
+ out++;
+ *out = ( (*in) >> 22 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 );
+ out++;
+ *out = ( (*in) >> 21 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 );
+ out++;
+ *out = ( (*in) >> 20 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 );
+ out++;
+ *out = ( (*in) >> 19 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 );
+ out++;
+ *out = ( (*in) >> 18 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 );
+ out++;
+ *out = ( (*in) >> 17 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 );
+ out++;
+ *out = ( (*in) >> 16 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 );
+ out++;
+ *out = ( (*in) >> 15 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 );
+ out++;
+ *out = ( (*in) >> 14 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 );
+ out++;
+ *out = ( (*in) >> 13 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 );
+ out++;
+ *out = ( (*in) >> 12 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 );
+ out++;
+ *out = ( (*in) >> 11 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 );
+ out++;
+ *out = ( (*in) >> 10 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 );
+ out++;
+ *out = ( (*in) >> 9 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 );
+ out++;
+ *out = ( (*in) >> 8 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 );
+ out++;
+ *out = ( (*in) >> 7 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 );
+ out++;
+ *out = ( (*in) >> 6 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 );
+ out++;
+ *out = ( (*in) >> 5 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 );
+ out++;
+ *out = ( (*in) >> 4 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 );
+ out++;
+ *out = ( (*in) >> 3 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 );
+ out++;
+ *out = ( (*in) >> 2 ) ;
+ ++in;
+ *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 );
+ out++;
+ *out = ( (*in) >> 1 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+
+const uint32_t * __fastunpack32_32(const uint32_t * __restrict in, uint32_t * __restrict out) {
+
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+ const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullunpacker32(in,out);
+
+ case 1:
+ return __fastunpack1_32(in,out);
+
+ case 2:
+ return __fastunpack2_32(in,out);
+
+ case 3:
+ return __fastunpack3_32(in,out);
+
+ case 4:
+ return __fastunpack4_32(in,out);
+
+ case 5:
+ return __fastunpack5_32(in,out);
+
+ case 6:
+ return __fastunpack6_32(in,out);
+
+ case 7:
+ return __fastunpack7_32(in,out);
+
+ case 8:
+ return __fastunpack8_32(in,out);
+
+ case 9:
+ return __fastunpack9_32(in,out);
+
+ case 10:
+ return __fastunpack10_32(in,out);
+
+ case 11:
+ return __fastunpack11_32(in,out);
+
+ case 12:
+ return __fastunpack12_32(in,out);
+
+ case 13:
+ return __fastunpack13_32(in,out);
+
+ case 14:
+ return __fastunpack14_32(in,out);
+
+ case 15:
+ return __fastunpack15_32(in,out);
+
+ case 16:
+ return __fastunpack16_32(in,out);
+
+ case 17:
+ return __fastunpack17_32(in,out);
+
+ case 18:
+ return __fastunpack18_32(in,out);
+
+ case 19:
+ return __fastunpack19_32(in,out);
+
+ case 20:
+ return __fastunpack20_32(in,out);
+
+ case 21:
+ return __fastunpack21_32(in,out);
+
+ case 22:
+ return __fastunpack22_32(in,out);
+
+ case 23:
+ return __fastunpack23_32(in,out);
+
+ case 24:
+ return __fastunpack24_32(in,out);
+
+ case 25:
+ return __fastunpack25_32(in,out);
+
+ case 26:
+ return __fastunpack26_32(in,out);
+
+ case 27:
+ return __fastunpack27_32(in,out);
+
+ case 28:
+ return __fastunpack28_32(in,out);
+
+ case 29:
+ return __fastunpack29_32(in,out);
+
+ case 30:
+ return __fastunpack30_32(in,out);
+
+ case 31:
+ return __fastunpack31_32(in,out);
+
+ case 32:
+ return __fastunpack32_32(in,out);
+
+ default:
+ break;
+ }
+ /* unsupported bit width; the original C++ version threw logic_error here.
+ Returning "in" unchanged avoids falling off the end of a non-void function. */
+ return in;
+ }
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+ uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullpacker(in,out);
+
+ case 1:
+ return __fastpackwithoutmask1_32(in,out);
+
+ case 2:
+ return __fastpackwithoutmask2_32(in,out);
+
+ case 3:
+ return __fastpackwithoutmask3_32(in,out);
+
+ case 4:
+ return __fastpackwithoutmask4_32(in,out);
+
+ case 5:
+ return __fastpackwithoutmask5_32(in,out);
+
+ case 6:
+ return __fastpackwithoutmask6_32(in,out);
+
+ case 7:
+ return __fastpackwithoutmask7_32(in,out);
+
+ case 8:
+ return __fastpackwithoutmask8_32(in,out);
+
+ case 9:
+ return __fastpackwithoutmask9_32(in,out);
+
+ case 10:
+ return __fastpackwithoutmask10_32(in,out);
+
+ case 11:
+ return __fastpackwithoutmask11_32(in,out);
+
+ case 12:
+ return __fastpackwithoutmask12_32(in,out);
+
+ case 13:
+ return __fastpackwithoutmask13_32(in,out);
+
+ case 14:
+ return __fastpackwithoutmask14_32(in,out);
+
+ case 15:
+ return __fastpackwithoutmask15_32(in,out);
+
+ case 16:
+ return __fastpackwithoutmask16_32(in,out);
+
+ case 17:
+ return __fastpackwithoutmask17_32(in,out);
+
+ case 18:
+ return __fastpackwithoutmask18_32(in,out);
+
+ case 19:
+ return __fastpackwithoutmask19_32(in,out);
+
+ case 20:
+ return __fastpackwithoutmask20_32(in,out);
+
+ case 21:
+ return __fastpackwithoutmask21_32(in,out);
+
+ case 22:
+ return __fastpackwithoutmask22_32(in,out);
+
+ case 23:
+ return __fastpackwithoutmask23_32(in,out);
+
+ case 24:
+ return __fastpackwithoutmask24_32(in,out);
+
+ case 25:
+ return __fastpackwithoutmask25_32(in,out);
+
+ case 26:
+ return __fastpackwithoutmask26_32(in,out);
+
+ case 27:
+ return __fastpackwithoutmask27_32(in,out);
+
+ case 28:
+ return __fastpackwithoutmask28_32(in,out);
+
+ case 29:
+ return __fastpackwithoutmask29_32(in,out);
+
+ case 30:
+ return __fastpackwithoutmask30_32(in,out);
+
+ case 31:
+ return __fastpackwithoutmask31_32(in,out);
+
+ case 32:
+ return __fastpackwithoutmask32_32(in,out);
+
+ default:
+ break;
+ }
+ /* unsupported bit width; the original C++ version threw logic_error here.
+ Returning "out" unchanged avoids falling off the end of a non-void function. */
+ return out;
+ }
+
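+/* Editor's sketch (not part of the original source): a minimal round-trip for
+ * the 32-integer kernels above. fastpackwithoutmask_32 assumes every value
+ * already fits in "bit" bits, and 32 values of width "bit" occupy exactly
+ * "bit" 32-bit words. The function name and the width 7 are illustrative. */
+#include <assert.h>
+static void example_roundtrip32(void) {
+    uint32_t in[32], packed[32], back[32];  /* 32 words covers the bit = 32 worst case */
+    const uint32_t bit = 7;                 /* hypothetical width */
+    for (int i = 0; i < 32; i++) in[i] = (uint32_t)i & ((1U << bit) - 1);
+    uint32_t *pend = fastpackwithoutmask_32(in, packed, bit);
+    assert(pend - packed == (ptrdiff_t)bit); /* 32 * bit bits = bit words */
+    fastunpack_32(packed, back, bit);
+    for (int i = 0; i < 32; i++) assert(back[i] == in[i]);
+}
+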
diff --git a/ext/simdcomp/bitpacka.h b/ext/simdcomp/bitpacka.h
new file mode 100644
index 0000000..6fa76c8
--- /dev/null
+++ b/ext/simdcomp/bitpacka.h
@@ -0,0 +1,28 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+#ifndef BITPACKINGALIGNED
+#define BITPACKINGALIGNED
+#include <stdint.h> // for uint32_t
+#include <stdlib.h>
+#include <string.h>
+
+const uint32_t * fastunpack_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+
+const uint32_t * fastunpack_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+
+const uint32_t * fastunpack_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+
+const uint32_t * fastunpack_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
+
+uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit);
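+
+// Editor's note (an assumption, inferred from the _32 kernels): the suffix is
+// the block length in integers, so fastpackwithoutmask_8/_16/_24/_32 pack
+// 8/16/24/32 values of "bit" bits each, which lets callers finish off blocks
+// that are not a multiple of 32 integers.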
+
+
+
+#endif // BITPACKINGALIGNED
diff --git a/ext/simdcomp/example.c b/ext/simdcomp/example.c
new file mode 100644
index 0000000..0394e20
--- /dev/null
+++ b/ext/simdcomp/example.c
@@ -0,0 +1,66 @@
+#include <stdio.h>  // printf
+#include <stdlib.h> // malloc, free
+#include <time.h>   // clock, CLOCKS_PER_SEC
+#include "simdcomp.h"
+
+
+// compresses data from datain to buffer, returns how many bytes written
+size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+ if(length/SIMDBlockSize*SIMDBlockSize != length) {
+ printf("Data length should be a multiple of %i; the trailing partial block is ignored.\n",SIMDBlockSize);
+ }
+ uint32_t offset = 0;
+ uint8_t * initout = buffer;
+ for(size_t k = 0; k < length / SIMDBlockSize; ++k) {
+ uint32_t b = simdmaxbitsd1(offset,
+ datain + k * SIMDBlockSize);
+ *buffer++ = b;
+ simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer,
+ b);
+ offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
+ buffer += b * sizeof(__m128i);
+ }
+ return buffer - initout;
+}
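+
+/* Editor's sketch (not part of the original example): the inverse of
+ * compress(), written as a standalone function. main() below performs the
+ * same decoding inline; this just names the steps. The function and
+ * parameter names are illustrative. "length" is the number of integers. */
+size_t decompress(uint8_t * buffer, size_t length, uint32_t * dataout) {
+ uint8_t * initbuffer = buffer;
+ uint32_t offset = 0;
+ for(size_t k = 0; k < length / SIMDBlockSize; ++k) {
+ uint8_t b = *buffer++; /* per-block bit width written by compress() */
+ simdunpackd1(offset, (__m128i *) buffer, dataout + k * SIMDBlockSize, b);
+ offset = dataout[k * SIMDBlockSize + SIMDBlockSize - 1];
+ buffer += b * sizeof(__m128i);
+ }
+ return buffer - initbuffer;
+}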
+
+
+int main() {
+ int REPEAT = 5;
+ int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128
+ uint32_t * datain = malloc(N * sizeof(uint32_t));
+ size_t compsize;
+ clock_t start, end;
+
+ uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer
+ uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+ for (int gap = 1; gap <= 243; gap *= 3) {
+ printf("\n");
+ printf(" gap = %u \n", gap);
+ for (int k = 0; k < N; ++k)
+ datain[k] = k * gap;
+ uint32_t offset = 0;
+ compsize = compress(datain,N,buffer);
+ printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
+ start = clock();
+ uint32_t bogus = 0;
+ for(int repeat = 0; repeat < REPEAT; ++repeat) {
+ uint8_t * decbuffer = buffer;
+ offset = 0; /* restart differential decoding at the head of the stream on every pass */
+ for (int k = 0; k * SIMDBlockSize < N; ++k) {
+ uint8_t b = *decbuffer++;
+ simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
+ // do something here with backbuffer
+ bogus += backbuffer[3];
+ decbuffer += b * sizeof(__m128i);
+ offset = backbuffer[SIMDBlockSize - 1];
+ }
+ }
+ end = clock();
+ double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+ printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+ printf("ignore me %i \n",bogus);
+ }
+ free(buffer);
+ free(datain);
+ free(backbuffer);
+ return 0;
+}
+
diff --git a/ext/simdcomp/include/simdbitpacking.h b/ext/simdcomp/include/simdbitpacking.h
new file mode 100644
index 0000000..301f4f5
--- /dev/null
+++ b/ext/simdcomp/include/simdbitpacking.h
@@ -0,0 +1,21 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITPACKING_H_
+#define SIMDBITPACKING_H_
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h> // use a C99-compliant compiler, please
+#include <string.h> // for memset
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+void simdpack(const uint32_t * in,__m128i * out, uint32_t bit);
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+void simdpackwithoutmask(const uint32_t * in,__m128i * out, uint32_t bit);
+
+//reads "bit" 128-bit vectors from "in", writes 128 values to "out"
+void simdunpack(const __m128i * in,uint32_t * out, uint32_t bit);
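+
+// Editor's note: simdpackwithoutmask assumes each value already fits in "bit"
+// bits (mirroring the scalar fastpackwithoutmask_32); simdpack masks its
+// input first, so it is the safe choice for untrusted data.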
+
+
+#endif /* SIMDBITPACKING_H_ */
diff --git a/ext/simdcomp/include/simdcomp.h b/ext/simdcomp/include/simdcomp.h
new file mode 100644
index 0000000..8875f0f
--- /dev/null
+++ b/ext/simdcomp/include/simdcomp.h
@@ -0,0 +1,12 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMP_H_
+#define SIMDCOMP_H_
+
+#include "simdbitpacking.h"
+#include "simdcomputil.h"
+#include "simdintegratedbitpacking.h"
+
+#endif
diff --git a/ext/simdcomp/include/simdcomputil.h b/ext/simdcomp/include/simdcomputil.h
new file mode 100644
index 0000000..107665b
--- /dev/null
+++ b/ext/simdcomp/include/simdcomputil.h
@@ -0,0 +1,29 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMPUTIL_H_
+#define SIMDCOMPUTIL_H_
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h> // use a C99-compliant compiler, please
+
+
+
+
+// returns the integer logarithm of v (bit width)
+uint32_t bits(const uint32_t v);
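+// e.g. (assuming the usual convention) bits(0)=0, bits(1)=1, bits(255)=8, bits(256)=9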
+
+// max integer logarithm over a range of SIMDBlockSize integers (128 integers)
+uint32_t maxbits(const uint32_t * begin);
+
+enum{ SIMDBlockSize = 128};
+
+// like maxbits over 128 integers (SIMDBlockSize) with a provided initial value
+// and using differential coding
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
+
+
+
+
+#endif /* SIMDCOMPUTIL_H_ */
diff --git a/ext/simdcomp/include/simdintegratedbitpacking.h b/ext/simdcomp/include/simdintegratedbitpacking.h
new file mode 100644
index 0000000..18ca795
--- /dev/null
+++ b/ext/simdcomp/include/simdintegratedbitpacking.h
@@ -0,0 +1,27 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMD_INTEGRATED_BITPACKING_H
+#define SIMD_INTEGRATED_BITPACKING_H
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h> // use a C99-compliant compiler, please
+
+#include "simdcomputil.h"
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+// integer values should be in sorted order (for best results)
+void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit);
+
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+// integer values should be in sorted order (for best results)
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit);
+
+
+//reads "bit" 128-bit vectors from "in", writes 128 values to "out"
+void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, uint32_t bit);
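+
+// Editor's note: "d1" means integrated differential coding. The packed words
+// hold in[i] - in[i-1] (with in[-1] = initvalue), so a sorted block of 128
+// integers round-trips as:
+//   simdpackwithoutmaskd1(prev, in, tmp, b); simdunpackd1(prev, tmp, back, b);
+// where "prev" is the last value of the preceding block (0 for the first).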
+
+
+#endif
diff --git a/ext/simdcomp/makefile b/ext/simdcomp/makefile
new file mode 100644
index 0000000..6ebd9d9
--- /dev/null
+++ b/ext/simdcomp/makefile
@@ -0,0 +1,54 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+
+CFLAGS = -fPIC -std=c99 -O3 -Wall -Wextra -Wno-unused-parameter -pedantic
+LDFLAGS = -shared
+LIBNAME=libsimdcomp.so.0.0.3
+all: unit $(LIBNAME)
+test:
+ ./unit
+install: $(LIBNAME)
+ cp $(LIBNAME) /usr/local/lib
+ ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
+ ldconfig
+ cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h
+
+uninstall:
+	for h in $(HEADERS) ; do rm /usr/local/include/$$(basename $$h); done
+ rm /usr/local/lib/$(LIBNAME)
+ rm /usr/local/lib/libsimdcomp.so
+ ldconfig
+
+
+OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o
+
+$(LIBNAME): $(OBJECTS)
+ $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
+
+
+
+simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
+ $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
+
+simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
+ $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
+
+simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS)
+ $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
+
+example: ./example.c $(HEADERS) $(OBJECTS)
+ $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
+
+unit: ./src/unit.c $(HEADERS) $(OBJECTS)
+ $(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS)
+dynunit: ./src/unit.c $(HEADERS) $(LIBNAME)
+ $(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp
+
+clean:
+ rm -f unit *.o $(LIBNAME)
diff --git a/ext/simdcomp/src/simdbitpacking.c b/ext/simdcomp/src/simdbitpacking.c
new file mode 100644
index 0000000..7137682
--- /dev/null
+++ b/ext/simdcomp/src/simdbitpacking.c
@@ -0,0 +1,14009 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include "../include/simdbitpacking.h"
+
+
+static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) {
+ (void) _in;
+ memset(out,0,32 * 4 * 4); /* zero 32 * 4 = 128 values of 4 bytes each */
+}
+
+static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
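+
+/* Editor's note: the SIMD pack kernels use a "vertical" layout: input value i
+ * is routed to SSE lane i % 4, and within each 32-bit lane values are packed
+ * consecutively exactly as in the scalar packer, so four interleaved streams
+ * are processed at once. For bit width 1, the 32 OR-ed shifts above fold all
+ * 128 inputs into a single output vector. */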
+
+
+
+static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask11_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
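+/* B = 14: two identical half-blocks, each packing 16 input words into 7 output words. */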
+static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
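+/* B = 15: a single pass over all 32 input words yields 15 output words. */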
+static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
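+/* B = 17: a single pass over all 32 input words yields 17 output words. */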
+static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
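+/* B = 18: two identical half-blocks, each packing 16 input words into 9 output words. */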
+static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
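+/* B = 19: a single pass over all 32 input words yields 19 output words. */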
+static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
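+/* B = 20: four identical blocks, each packing 8 input words into 5 output words. */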
+static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
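+/* B = 21: a single pass over all 32 input words yields 21 output words. */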
+static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
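+/* B = 22: two identical half-blocks, each packing 16 input words into 11 output words. */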
+static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
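+/* B = 23: a single pass over all 32 input words yields 23 output words. */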
+static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 21);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
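+/* B = 24: eight identical blocks of 4 input words -> 3 output words; all shifts are byte-aligned. */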
+static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
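+/* B = 25: a single pass over all 32 input words yields 25 output words. */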
+static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 23);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 21);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
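+/* B = 26: two identical half-blocks, each packing 16 input words into 13 output words. */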
+static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
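+/* Worked instance of the carry step for b = 27: value #1 is shifted left by
+   27, so only its low 5 bits (32 - 27) land in output word 0. Its remaining
+   22 high bits are recovered with _mm_srli_epi32(InReg, 27 - 22) and become
+   bits 0..21 of output word 1, where value #2 then starts at bit 22. */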
+static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 26);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 21);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 23);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 25);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 26);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 23);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 28);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 25);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 27);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 21);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+/* b = 31: since 31 and 32 are coprime, every word boundary in the 31-word
+   output gets a carry step. */
+static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 30);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 29);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 28);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 27);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 26);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 25);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 24);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 23);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 22);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 21);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 20);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 19);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 18);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 17);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 16);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 15);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 14);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 13);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 12);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 11);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 10);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 9);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 8);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 7);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 6);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 5);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 4);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 3);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 2);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 1);
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+/* Width 32 degenerates to a straight copy: 32 unaligned loads and stores, no
+   shifting or masking. */
+static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
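+/* For bit widths that divide 32 (4, 8 and 16 below; width 32 above is a raw
+   copy) no value ever straddles a 32-bit lane, so the kernels reduce to
+   simple shift-and-or loops with no carry correction. */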
+static void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg;
+ uint32_t outer;
+ for(outer=0; outer< 4 ;++outer) {
+ InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+
+ InReg = _mm_loadu_si128(in+1);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+
+ InReg = _mm_loadu_si128(in+2);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+
+ InReg = _mm_loadu_si128(in+3);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+
+ InReg = _mm_loadu_si128(in+4);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+
+ InReg = _mm_loadu_si128(in+5);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+
+ InReg = _mm_loadu_si128(in+6);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+
+ InReg = _mm_loadu_si128(in+7);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=8;
+ }
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg;
+ uint32_t outer;
+ for(outer=0; outer< 8 ;++outer) {
+ InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+
+ InReg = _mm_loadu_si128(in+1);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+
+ InReg = _mm_loadu_si128(in+2);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+
+ InReg = _mm_loadu_si128(in+3);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=4;
+ }
+
+}
+
+
+
+static void __SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+ __m128i InReg;
+ uint32_t outer;
+ for(outer=0; outer< 16 ;++outer) {
+ InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+
+ InReg = _mm_loadu_si128(in+1);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=2;
+ }
+
+}
+
+
+
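+/* The __SIMD_fastpack{b}_32 variants below share the output layout of the
+   "withoutmask" kernels above but AND every input register with
+   (1U << b) - 1 first, so callers need not guarantee that values fit in b
+   bits. */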
+static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<1)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<2)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+/* First masked width where values straddle word boundaries: 3 does not
+   divide 32, so the srli(InReg, 3 - r) steps carry the high bits across. */
+static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<3)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<5)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<6)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
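+/* Illustrative usage sketch, not part of the original generated file: pack
+   one block of 128 uint32 values at bit width 6 with the masked kernel
+   defined above. The helper name is hypothetical. Output is 6 __m128i =
+   24 uint32 = 96 bytes; unaligned pointers are fine since the kernels use
+   storeu. */
+static void example_pack_block6(const uint32_t in[128], uint32_t out[24]) {
+  __SIMD_fastpack6_32(in, (__m128i *)out); /* masks each value to 6 bits, then packs */
+}
+
+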
+static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<7)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
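+/* 9-bit width: 128 values into 9 output registers. No 8-bit packer
+   appears here, presumably because 8 divides 32 evenly and is handled
+   by a simpler word-aligned specialization elsewhere. */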
+static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<9)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
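+/* 10-bit width: 16 values fill exactly five 32-bit words (160 bits),
+   so the same 5-store sequence runs twice per call (10 outputs). */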
+static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<10)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
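+/* 11-bit width: 11 output registers. Since gcd(11, 32) = 1, the bit
+   offset never realigns mid-lane and the carry chain runs unbroken
+   through all 32 values. */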
+static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<11)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
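+/* 12-bit width: 8 values fill three words (96 bits); the 3-store
+   pattern repeats four times (12 outputs). */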
+static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<12)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
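+/* 13-bit width: 13 output registers, single unbroken carry chain. */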
+static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<13)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
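+/* 14-bit width: 16 values fill seven words (224 bits); the 7-store
+   half repeats twice (14 outputs). */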
+static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<14)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
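+/* 15-bit width: 15 output registers, single unbroken carry chain. */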
+static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<15)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
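+/* 17-bit width: 17 output registers. The 16-bit width is skipped,
+   presumably handled by a trivial half-word specialization. From 17
+   bits up, each 32-bit word holds at most one whole value plus the
+   fragments of its neighbours. */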
+static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<17)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
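+/* 18-bit width: 16 values fill nine words (288 bits); the 9-store
+   half repeats twice (18 outputs). */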
+static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<18)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
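+/* 19-bit width: 19 output registers, single unbroken carry chain. */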
+static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<19)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
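+/* 20-bit width: 8 values fill five words (160 bits); the 5-store
+   pattern repeats four times (20 outputs). */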
+static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<20)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
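+/* __SIMD_fastpack21_32 and the routines below all follow the same vertical
+   bit-packing pattern: 128 32-bit integers are read as 32 __m128i vectors
+   (lane j of vector k holds _in[4*k + j]); each lane independently packs
+   its 32 values at 21 bits apiece, so the block compresses into exactly
+   21 output __m128i words. Inputs are truncated to their low 21 bits by
+   the mask; a value straddling a 32-bit boundary is split, its high bits
+   carried into the next output word by the _mm_srli_epi32 steps. */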
+static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<21)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
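+/* Pack 128 32-bit values at 22 bits each into 22 __m128i output words. */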
+static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<22)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
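+/* Pack 128 32-bit values at 23 bits each into 23 __m128i output words. */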
+static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<23)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 21);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
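+/* Pack 128 32-bit values at 24 bits each into 24 __m128i output words;
+   since 4*24 = 3*32, the same 3-word pattern repeats for every 4 input
+   vectors, which is why the body below is visibly periodic. */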
+static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<24)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
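+/* Pack 128 32-bit values at 25 bits each into 25 __m128i output words. */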
+static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<25)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 23);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 21);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
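+/* Pack 128 32-bit values at 26 bits each into 26 __m128i output words
+   (16*26 = 13*32, so the 13-word pattern repeats halfway through). */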
+static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<26)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
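+/* Pack 128 32-bit values at 27 bits each into 27 __m128i output words. */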
+static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<27)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 26);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 21);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 23);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 25);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
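+/* Pack 128 32-bit values at 28 bits each into 28 __m128i output words;
+   8*28 = 7*32, so the 7-word pattern repeats for every 8 input vectors. */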
+static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<28)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
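+/* Pack 128 32-bit values at 29 bits each into 29 __m128i output words. */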
+static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<29)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 26);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 23);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 28);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 25);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 27);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 21);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
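+/* Pack 128 32-bit values at 30 bits each into 30 __m128i output words
+   (16*30 = 15*32, so the 15-word pattern repeats halfway through). */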
+static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<30)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
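+/* Pack 128 32-bit integers into 31-bit fields: 32 input vectors become 31
+   output vectors; bits that overflow a word are carried into the next one. */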
+static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32((1U<<31)-1);
+
+ __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
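+    /* this value straddles words 0 and 1: start word 1 with its remaining 30 high bits */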
+ OutReg = _mm_srli_epi32(InReg, 31 - 30);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 29);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 28);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 27);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 26);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 25);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 24);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 23);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 22);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 21);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 20);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 19);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 18);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 17);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 16);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 15);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 14);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 13);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 12);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 11);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 10);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 9);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 8);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 7);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 6);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 5);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 4);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 3);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 2);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 1);
+ InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
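+/* 32-bit case: packing is the identity, so just copy the 32 input vectors. */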
+static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+ __m128i InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
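+/* Pack 128 4-bit values: 4 divides 32 evenly, so each output word holds
+   exactly 8 values and a short loop replaces full unrolling. */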
+static void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg, InReg;
+ const __m128i mask = _mm_set1_epi32((1U<<4)-1);
+
+ uint32_t outer;
+    for(outer = 0; outer < 4; ++outer) {
+ InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=8;
+ }
+
+}
+
+
+
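+/* Pack 128 8-bit values: 4 values per output word, 8 loop iterations. */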
+static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg, InReg;
+ const __m128i mask = _mm_set1_epi32((1U<<8)-1);
+
+ uint32_t outer;
+    for(outer = 0; outer < 8; ++outer) {
+ InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=4;
+ }
+
+}
+
+
+
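+/* Pack 128 16-bit values: 2 values per output word, 16 loop iterations. */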
+static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg, InReg;
+ const __m128i mask = _mm_set1_epi32((1U<<16)-1);
+
+ uint32_t outer;
+    for(outer = 0; outer < 16; ++outer) {
+ InReg = _mm_and_si128(_mm_loadu_si128(in), mask);
+ OutReg = InReg;
+
+ InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask);
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+ ++out;
+
+ in+=2;
+ }
+
+}
+
+
+
+
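+/* Unpack 128 1-bit values from a single input vector: each 32-bit lane holds
+   32 flags, extracted by successive shifts (four output vectors per iteration). */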
+static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) {
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg1 = _mm_loadu_si128(in);
+ __m128i InReg2 = InReg1;
+ __m128i OutReg1, OutReg2, OutReg3, OutReg4;
+ const __m128i mask = _mm_set1_epi32(1);
+
+ unsigned shift = 0;
+ unsigned i;
+ for (i = 0; i < 8; ++i) {
+ OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask);
+ OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask);
+ OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask);
+ OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask);
+ _mm_storeu_si128(out++, OutReg1);
+ _mm_storeu_si128(out++, OutReg2);
+ _mm_storeu_si128(out++, OutReg3);
+ _mm_storeu_si128(out++, OutReg4);
+ }
+}
+
+
+
+
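+/* Unpack 128 2-bit values from 2 input vectors; 16 values per word, none
+   crossing a word boundary. */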
+static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<2)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
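+/* Unpack 128 3-bit values from 3 input vectors. Since 3 does not divide 32,
+   two of the values straddle a word boundary and are reassembled below. */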
+static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<3)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
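+    /* this value spans the boundary: low 2 bits come from bit offset 30,
+       the remaining high bit is OR-ed in from the next word, shifted left by 3-1 */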
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
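+/* Unpack 128 4-bit values from 4 input vectors; 8 aligned values per word. */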
+static void __SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<4)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
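+/* Unpack 128 5-bit values from 5 input vectors. */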
+static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<5)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
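+/* Unpack 128 6-bit values from 6 input vectors; the bit layout repeats after
+   16 values (96 bits = 3 words). */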
+static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<6)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
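+/* Unpack 128 7-bit values from 7 input vectors. */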
+static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<7)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
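+/* Unpack 128 8-bit values from 8 input vectors; 4 aligned values per word. */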
+static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<8)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
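+/* Unpack 128 9-bit values from 9 input vectors. */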
+static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<9)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
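+/* Unpack 128 10-bit values from 10 input vectors; the bit layout repeats after
+   16 values (160 bits = 5 words). */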
+static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<10)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
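+/* Unpack 128 11-bit values from 11 input vectors. */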
+static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<11)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
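+/* Unpack 128 12-bit values from 12 input vectors; the bit layout repeats after
+   8 values (96 bits = 3 words). */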
+static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<12)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
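+/* Unpack 128 13-bit values from 13 input vectors. */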
+static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<13)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
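+/* Unpack 128 values stored at 14 bits each: 14 __m128i in, 32 __m128i
+   (128 uint32_t) out. Bit offsets realign after 16 values, so the same
+   load/shift/mask schedule is repeated twice. */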
+static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<14)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
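+/* Unpack 128 values stored at 15 bits each: 15 __m128i in, 32 __m128i
+   out, fully unrolled (offsets never realign within the block); values
+   crossing a word boundary are stitched from two consecutive words. */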
+static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<15)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
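+/* Unpack 128 values stored at 16 bits each: 16 __m128i in, 32 __m128i
+   out. Each 32-bit lane holds exactly two values, so one mask and one
+   shift per pair suffice and no cross-word stitching is needed. */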
+static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<16)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
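+
+/* Illustrative use (hypothetical caller, not part of this file): decode
+ * one block of 128 integers packed at 16 bits per value.
+ *
+ *   uint32_t buf[128];
+ *   __SIMD_fastunpack16_32((const __m128i *)packed, buf);
+ *
+ * `packed` must point at 16 readable __m128i; unaligned input is fine
+ * since all loads here are _mm_loadu_si128. */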
+
+
+
+
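+/* Unpack 128 values stored at 17 bits each: 17 __m128i in, 32 __m128i
+   out, fully unrolled. Values whose 17 bits cross a word boundary are
+   reassembled from two consecutive input words. */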
+static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<17)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
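+/* Unpack 128 values stored at 18 bits each: 18 __m128i in, 32 __m128i
+   out. Bit offsets realign after 16 values, so the 9-word schedule is
+   repeated twice. */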
+static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<18)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
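+/* Unpack 128 values stored at 19 bits each: 19 __m128i in, 32 __m128i
+   out, fully unrolled (offsets never realign within the block). */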
+static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<19)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
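+/* Unpack 128 values stored at 20 bits each: 20 __m128i in, 32 __m128i
+   out. Bit offsets realign every 8 values, so the 5-word schedule is
+   repeated four times. */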
+static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<20)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
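+/* Unpack 128 values stored at 21 bits each: 21 __m128i in, 32 __m128i
+   out, fully unrolled (offsets never realign within the block). */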
+static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<21)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
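+/* Unpack 128 values stored at 22 bits each: 22 __m128i in, 32 __m128i
+   out. Bit offsets realign after 16 values, so the 11-word schedule is
+   repeated twice. */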
+static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<22)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
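+/* Unpack 128 values stored at 23 bits each: 23 __m128i in, 32 __m128i
+   out, fully unrolled; most values straddle a word boundary and are
+   reassembled from two consecutive input words. */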
+static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<23)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,9) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
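+/* Unpack 128 values stored at 24 bits each: 24 __m128i in, 32 __m128i
+   out. Bit offsets realign every 4 values, so the 3-word schedule is
+   repeated eight times. */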
+static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<24)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
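+/* Unpack 128 values stored at 25 bits each: 25 __m128i in, 32 __m128i
+   out, fully unrolled; all but a handful of values straddle a word
+   boundary and are reassembled from two consecutive input words. */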
+static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<25)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,9) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,7) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<26)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<27)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,7) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,9) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,5) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<28)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<29)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,5) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,7) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,9) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,3) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<30)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,2) ;
+ InReg = _mm_loadu_si128(++in);
+
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,2) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+
+
+static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ const __m128i mask = _mm_set1_epi32((1U<<31)-1);
+
+ OutReg = _mm_and_si128( InReg , mask);
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,31) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,30) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,29) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,28) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,27) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,26) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,25) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,24) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,23) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,22) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,21) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,20) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,19) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,18) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,17) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,16) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,15) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,14) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,13) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,12) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,11) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,10) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,9) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,8) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,7) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,6) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,5) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,4) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,3) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,2) ;
+ InReg = _mm_loadu_si128(++in);
+
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask));
+ _mm_storeu_si128(out++, OutReg);
+
+ OutReg = _mm_srli_epi32(InReg,1) ;
+ _mm_storeu_si128(out++, OutReg);
+
+
+}
+
+
+void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) {
+ __m128i* out = (__m128i*)(_out);
+ uint32_t outer;
+ for(outer=0; outer< 32 ;++outer) {
+ _mm_storeu_si128(out++, _mm_loadu_si128(in++));
+ }
+}
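+
+/* Note: at bit width 32 nothing is actually packed; the block is a straight
+ * copy of 32 vectors (128 integers). */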
+
+
+
+void simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) {
+ switch(bit) {
+ case 0: SIMD_nullunpacker32(in,out); return;
+
+ case 1: __SIMD_fastunpack1_32(in,out); return;
+
+ case 2: __SIMD_fastunpack2_32(in,out); return;
+
+ case 3: __SIMD_fastunpack3_32(in,out); return;
+
+ case 4: __SIMD_fastunpack4_32(in,out); return;
+
+ case 5: __SIMD_fastunpack5_32(in,out); return;
+
+ case 6: __SIMD_fastunpack6_32(in,out); return;
+
+ case 7: __SIMD_fastunpack7_32(in,out); return;
+
+ case 8: __SIMD_fastunpack8_32(in,out); return;
+
+ case 9: __SIMD_fastunpack9_32(in,out); return;
+
+ case 10: __SIMD_fastunpack10_32(in,out); return;
+
+ case 11: __SIMD_fastunpack11_32(in,out); return;
+
+ case 12: __SIMD_fastunpack12_32(in,out); return;
+
+ case 13: __SIMD_fastunpack13_32(in,out); return;
+
+ case 14: __SIMD_fastunpack14_32(in,out); return;
+
+ case 15: __SIMD_fastunpack15_32(in,out); return;
+
+ case 16: __SIMD_fastunpack16_32(in,out); return;
+
+ case 17: __SIMD_fastunpack17_32(in,out); return;
+
+ case 18: __SIMD_fastunpack18_32(in,out); return;
+
+ case 19: __SIMD_fastunpack19_32(in,out); return;
+
+ case 20: __SIMD_fastunpack20_32(in,out); return;
+
+ case 21: __SIMD_fastunpack21_32(in,out); return;
+
+ case 22: __SIMD_fastunpack22_32(in,out); return;
+
+ case 23: __SIMD_fastunpack23_32(in,out); return;
+
+ case 24: __SIMD_fastunpack24_32(in,out); return;
+
+ case 25: __SIMD_fastunpack25_32(in,out); return;
+
+ case 26: __SIMD_fastunpack26_32(in,out); return;
+
+ case 27: __SIMD_fastunpack27_32(in,out); return;
+
+ case 28: __SIMD_fastunpack28_32(in,out); return;
+
+ case 29: __SIMD_fastunpack29_32(in,out); return;
+
+ case 30: __SIMD_fastunpack30_32(in,out); return;
+
+ case 31: __SIMD_fastunpack31_32(in,out); return;
+
+ case 32: __SIMD_fastunpack32_32(in,out); return;
+
+ default: break;
+ }
+}
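+
+/* Usage sketch (illustrative, not part of the library API): decoding one
+ * block of SIMDBlockSize (128) integers packed at bit width b; "packed" must
+ * hold b 128-bit words, i.e. 16*b bytes:
+ *
+ *     uint32_t decoded[128];
+ *     simdunpack((const __m128i *) packed, decoded, b);
+ */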
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return;
+
+ case 1: __SIMD_fastpackwithoutmask1_32(in,out); return;
+
+ case 2: __SIMD_fastpackwithoutmask2_32(in,out); return;
+
+ case 3: __SIMD_fastpackwithoutmask3_32(in,out); return;
+
+ case 4: __SIMD_fastpackwithoutmask4_32(in,out); return;
+
+ case 5: __SIMD_fastpackwithoutmask5_32(in,out); return;
+
+ case 6: __SIMD_fastpackwithoutmask6_32(in,out); return;
+
+ case 7: __SIMD_fastpackwithoutmask7_32(in,out); return;
+
+ case 8: __SIMD_fastpackwithoutmask8_32(in,out); return;
+
+ case 9: __SIMD_fastpackwithoutmask9_32(in,out); return;
+
+ case 10: __SIMD_fastpackwithoutmask10_32(in,out); return;
+
+ case 11: __SIMD_fastpackwithoutmask11_32(in,out); return;
+
+ case 12: __SIMD_fastpackwithoutmask12_32(in,out); return;
+
+ case 13: __SIMD_fastpackwithoutmask13_32(in,out); return;
+
+ case 14: __SIMD_fastpackwithoutmask14_32(in,out); return;
+
+ case 15: __SIMD_fastpackwithoutmask15_32(in,out); return;
+
+ case 16: __SIMD_fastpackwithoutmask16_32(in,out); return;
+
+ case 17: __SIMD_fastpackwithoutmask17_32(in,out); return;
+
+ case 18: __SIMD_fastpackwithoutmask18_32(in,out); return;
+
+ case 19: __SIMD_fastpackwithoutmask19_32(in,out); return;
+
+ case 20: __SIMD_fastpackwithoutmask20_32(in,out); return;
+
+ case 21: __SIMD_fastpackwithoutmask21_32(in,out); return;
+
+ case 22: __SIMD_fastpackwithoutmask22_32(in,out); return;
+
+ case 23: __SIMD_fastpackwithoutmask23_32(in,out); return;
+
+ case 24: __SIMD_fastpackwithoutmask24_32(in,out); return;
+
+ case 25: __SIMD_fastpackwithoutmask25_32(in,out); return;
+
+ case 26: __SIMD_fastpackwithoutmask26_32(in,out); return;
+
+ case 27: __SIMD_fastpackwithoutmask27_32(in,out); return;
+
+ case 28: __SIMD_fastpackwithoutmask28_32(in,out); return;
+
+ case 29: __SIMD_fastpackwithoutmask29_32(in,out); return;
+
+ case 30: __SIMD_fastpackwithoutmask30_32(in,out); return;
+
+ case 31: __SIMD_fastpackwithoutmask31_32(in,out); return;
+
+ case 32: __SIMD_fastpackwithoutmask32_32(in,out); return;
+
+ default: break;
+ }
+}
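+
+/* The "withoutmask" variants skip the masking step for speed, so a value
+ * wider than "bit" bits would corrupt neighbouring entries. A defensive
+ * caller can check the precondition first (sketch; maxbits() is added in
+ * simdcomputil.c below):
+ *
+ *     assert(maxbits(in) <= bit);
+ *     simdpackwithoutmask(in, out, bit);
+ */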
+
+
+
+ /*masks each value to the prescribed number of bits before packing*/
+void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return;
+
+ case 1: __SIMD_fastpack1_32(in,out); return;
+
+ case 2: __SIMD_fastpack2_32(in,out); return;
+
+ case 3: __SIMD_fastpack3_32(in,out); return;
+
+ case 4: __SIMD_fastpack4_32(in,out); return;
+
+ case 5: __SIMD_fastpack5_32(in,out); return;
+
+ case 6: __SIMD_fastpack6_32(in,out); return;
+
+ case 7: __SIMD_fastpack7_32(in,out); return;
+
+ case 8: __SIMD_fastpack8_32(in,out); return;
+
+ case 9: __SIMD_fastpack9_32(in,out); return;
+
+ case 10: __SIMD_fastpack10_32(in,out); return;
+
+ case 11: __SIMD_fastpack11_32(in,out); return;
+
+ case 12: __SIMD_fastpack12_32(in,out); return;
+
+ case 13: __SIMD_fastpack13_32(in,out); return;
+
+ case 14: __SIMD_fastpack14_32(in,out); return;
+
+ case 15: __SIMD_fastpack15_32(in,out); return;
+
+ case 16: __SIMD_fastpack16_32(in,out); return;
+
+ case 17: __SIMD_fastpack17_32(in,out); return;
+
+ case 18: __SIMD_fastpack18_32(in,out); return;
+
+ case 19: __SIMD_fastpack19_32(in,out); return;
+
+ case 20: __SIMD_fastpack20_32(in,out); return;
+
+ case 21: __SIMD_fastpack21_32(in,out); return;
+
+ case 22: __SIMD_fastpack22_32(in,out); return;
+
+ case 23: __SIMD_fastpack23_32(in,out); return;
+
+ case 24: __SIMD_fastpack24_32(in,out); return;
+
+ case 25: __SIMD_fastpack25_32(in,out); return;
+
+ case 26: __SIMD_fastpack26_32(in,out); return;
+
+ case 27: __SIMD_fastpack27_32(in,out); return;
+
+ case 28: __SIMD_fastpack28_32(in,out); return;
+
+ case 29: __SIMD_fastpack29_32(in,out); return;
+
+ case 30: __SIMD_fastpack30_32(in,out); return;
+
+ case 31: __SIMD_fastpack31_32(in,out); return;
+
+ case 32: __SIMD_fastpack32_32(in,out); return;
+
+ default: break;
+ }
+}
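+
+/* Round-trip sketch (a minimal example, assuming one full block of 128
+ * integers; the helper name is hypothetical):
+ *
+ *     static void roundtrip(const uint32_t in[128], uint32_t out[128]) {
+ *         __m128i buf[32];                // worst case: 32 bits x 128 ints
+ *         const uint32_t b = maxbits(in); // bit width needed by this block
+ *         simdpack(in, buf, b);           // pack into b 128-bit words
+ *         simdunpack(buf, out, b);        // decode them back
+ *     }
+ */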
+
+
+
diff --git a/ext/simdcomp/src/simdcomputil.c b/ext/simdcomp/src/simdcomputil.c
new file mode 100644
index 0000000..9b36da5
--- /dev/null
+++ b/ext/simdcomp/src/simdcomputil.c
@@ -0,0 +1,56 @@
+#include "../include/simdcomputil.h"
+
+__attribute__((always_inline))
+static inline __m128i Delta(__m128i curr, __m128i prev) {
+ return _mm_sub_epi32(curr,
+ _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)));
+}
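+
+/* Delta computes, per 32-bit lane, curr[i] - curr[i-1], with the lane before
+ * curr[0] taken from the last lane of "prev". Scalar equivalent (sketch):
+ *
+ *     d[0] = curr[0] - prev[3];
+ *     for (i = 1; i < 4; i++) d[i] = curr[i] - curr[i-1];
+ */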
+
+
+// returns the number of bits needed to represent v (0 when v == 0)
+uint32_t bits(const uint32_t v) {
+#ifdef _MSC_VER
+ if (v == 0) {
+ return 0;
+ }
+ unsigned long answer;
+ _BitScanReverse(&answer, v);
+ return answer + 1;
+#else
+ return v == 0 ? 0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft
+#endif
+}
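+
+/* Examples: bits(0) == 0, bits(1) == 1, bits(255) == 8, bits(256) == 9. */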
+
+__attribute__ ((pure))
+uint32_t maxbits(const uint32_t * begin) {
+ uint32_t accumulator = 0;
+ const uint32_t * k;
+ for (k = begin; k != begin + SIMDBlockSize; ++k) {
+ accumulator |= *k;
+ }
+ return bits(accumulator);
+}
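+
+/* OR-ing all SIMDBlockSize values yields a word whose most significant set
+ * bit matches that of the largest value, so bits(accumulator) is the bit
+ * width needed to store every integer in the block. */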
+
+static uint32_t maxbitas32int(const __m128i accumulator) {
+ uint32_t tmparray[4];
+ _mm_storeu_si128((__m128i *) (tmparray), accumulator);
+ return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]);
+}
+
+
+// maximum bit width of the deltas over 128 integers (SIMDBlockSize),
+// using initvalue as the value preceding in[0]
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) {
+ __m128i initoffset = _mm_set1_epi32 (initvalue);
+ const __m128i* pin = (const __m128i*)(in);
+ __m128i newvec = _mm_loadu_si128(pin);
+ __m128i accumulator = Delta(newvec , initoffset);
+ __m128i oldvec = newvec;
+ uint32_t k;
+ for(k = 1; 4*k < SIMDBlockSize; ++k) {
+ newvec = _mm_loadu_si128(pin+k);
+ accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec));
+ oldvec = newvec;
+ }
+ return maxbitas32int(accumulator);
+}
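+
+/* Typical use on sorted data, before delta ("d1") packing; "prev" is the last
+ * value of the preceding block, or 0 for the first block (sketch):
+ *
+ *     uint32_t b = simdmaxbitsd1(prev, in);  // bit width of the 128 deltas
+ */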
+
diff --git a/ext/simdcomp/src/simdintegratedbitpacking.c b/ext/simdcomp/src/simdintegratedbitpacking.c
new file mode 100644
index 0000000..951bb85
--- /dev/null
+++ b/ext/simdcomp/src/simdintegratedbitpacking.c
@@ -0,0 +1,24872 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include "../include/simdintegratedbitpacking.h"
+
+__attribute__((always_inline))
+static inline __m128i Delta(__m128i curr, __m128i prev) {
+ return _mm_sub_epi32(curr,
+ _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12)));
+}
+
+__attribute__((always_inline))
+static inline __m128i PrefixSum(__m128i curr, __m128i prev) {
+ const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr);
+ const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1);
+ return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff));
+}
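+
+/* PrefixSum inverts Delta: the two shifted adds form an in-register inclusive
+ * prefix sum of "curr", and _mm_shuffle_epi32(prev, 0xff) broadcasts the last
+ * lane of "prev" before adding it. Scalar equivalent (sketch):
+ *
+ *     out[i] = prev[3] + curr[0] + ... + curr[i];
+ */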
+
+
+__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) {
+ __m128i *out = (__m128i*)(_out);
+ const __m128i zero = _mm_set1_epi32(0);
+ unsigned i;
+ (void) _in;
+ for (i = 0; i < 8; ++i) {
+ initOffset = PrefixSum(zero, initOffset);
+ _mm_storeu_si128(out++, initOffset);
+ initOffset = PrefixSum(zero, initOffset);
+ _mm_storeu_si128(out++, initOffset);
+ initOffset = PrefixSum(zero, initOffset);
+ _mm_storeu_si128(out++, initOffset);
+ initOffset = PrefixSum(zero, initOffset);
+ _mm_storeu_si128(out++, initOffset);
+ }
+
+ return initOffset;
+}
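+
+/* With bit width 0 every delta is zero, so PrefixSum(zero, ...) fills all
+ * four lanes with the last lane of the previous vector: the block decodes to
+ * 128 copies of the running offset. The body is unrolled 4x, so the loop
+ * emits 8 * 4 = 32 vectors. */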
+
+
+
+
+void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) {
+ (void) initOffset;
+ (void) _in;
+ (void) out;
+}
+
+
+void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) {
+ (void) initOffset;
+ (void) _in;
+ (void) out;
+}
+
+
+
+void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
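+
+/* Pattern of the generated ipackwithoutmask<b> encoders: each of the 32 input
+ * vectors is delta-coded against the running offset and OR-ed into the output
+ * at shifts 0, b, 2b, ...; a 128-bit word is stored whenever 32 bits fill up.
+ * For b == 1 all 32 one-bit deltas fit in the single word stored at the end. */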
+
+
+
+
+void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(1U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
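+/* ipackwithoutmask2: delta-encode and pack 128 uint32 values at 2 bits per
+   value into 2 output vectors. The "withoutmask" variants skip the AND with
+   the bit mask, so the caller must guarantee every delta fits in 2 bits. */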
+void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
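+/* ipack2: masked 2-bit variant. Each delta is AND-ed with a 2-bit mask
+   before packing, so oversized deltas cannot spill into neighbouring
+   fields. */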
+void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(3U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
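+/* ipackwithoutmask3: unmasked 3-bit packing of 128 values into 3 output
+   vectors. 32 is not a multiple of 3, so some values straddle a 32-bit lane
+   boundary; their spilled high bits are carried into the next output word
+   with _mm_srli_epi32. */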
+void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
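+    /* the last value straddled the 32-bit lane: its spilled high bit begins the new output word */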
+ OutReg = _mm_srli_epi32(InReg, 3 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
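+/* ipack3: masked 3-bit packing; same word-boundary carry handling as
+   ipackwithoutmask3 above. */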
+void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(7U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
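+    /* the last value straddled the 32-bit lane: its spilled high bit begins the new output word */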
+ OutReg = _mm_srli_epi32(InReg, 3 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 3 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
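+/* ipackwithoutmask4: unmasked 4-bit packing of 128 values into 4 output
+   vectors. 32 is a multiple of 4, so no value straddles a lane boundary and
+   no carry logic is needed. */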
+void ipackwithoutmask4(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
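+/* ipack4: masked 4-bit packing; eight 4-bit deltas fill each 32-bit lane of
+   every output vector exactly. */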
+void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(15U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
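+/* ipackwithoutmask5: unmasked 5-bit packing of 128 values into 5 output
+   vectors, carrying spilled high bits into the next output word at every
+   lane boundary a value straddles. */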
+void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
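+    /* the last value straddled the 32-bit lane: its 3 spilled high bits begin the new output word */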
+ OutReg = _mm_srli_epi32(InReg, 5 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
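+/* ipack5: masked 5-bit packing; same word-boundary carry handling as
+   ipackwithoutmask5 above. */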
+void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(31U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
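+    /* the last value straddled the 32-bit lane: its 3 spilled high bits begin the new output word */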
+ OutReg = _mm_srli_epi32(InReg, 5 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 5 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
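+/* ipackwithoutmask6: unmasked 6-bit packing of 128 values into 6 output
+   vectors; values straddling a lane boundary are split across two output
+   words. */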
+void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
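+    /* the last value straddled the 32-bit lane: its 4 spilled high bits begin the new output word */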
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
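+/* ipack6: masked 6-bit packing; same layout and carry handling as
+   ipackwithoutmask6 above. */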
+void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(63U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 6 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
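+/* ipackwithoutmask7: 7-bit variant of the delta bit-packer; no masking is
+   applied, so every per-lane delta must already fit in 7 bits. Packs 128
+   integers into 7 consecutive 128-bit words. */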
+void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
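+/* ipack7: delta-encodes 128 32-bit integers, masks each per-lane delta to
+   its low 7 bits, and bit-packs the result into 7 consecutive 128-bit
+   words; values straddling a word boundary are split across two stores. */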
+void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(127U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 7 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
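+/* ipackwithoutmask8: byte-aligned case -- each per-lane delta (assumed to
+   fit in 8 bits) lands at shift 0, 8, 16 or 24, so no value ever straddles
+   a word boundary. Packs 128 integers into 8 consecutive 128-bit words. */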
+void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
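+/* ipack8: as above, but masks each delta to its low 8 bits first; four
+   deltas fill each 32-bit lane exactly, giving 8 output words with no
+   cross-word carries. */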
+void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(255U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
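+/* ipackwithoutmask9: packs unmasked per-lane deltas (assumed < 2^9) at
+   9-bit offsets into 9 consecutive 128-bit words, carrying the spilled
+   high bits of boundary-straddling values into the next word. */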
+void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
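+/* ipack9: same 9-bit layout as ipackwithoutmask9, but each delta is
+   masked to its low 9 bits before packing. */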
+void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(511U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 9 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
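+/* ipackwithoutmask10: 10-bit packer for unmasked deltas; 16 deltas fill
+   exactly 5 output words, so the pattern repeats twice over the 32 input
+   vectors, producing 10 consecutive 128-bit words. */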
+void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
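+/* ipack10: masked 10-bit variant -- deltas are ANDed with 0x3FF before
+   being packed into 10 consecutive 128-bit words. */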
+void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(1023U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 10 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
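+/* ipackwithoutmask11: 11-bit packer for unmasked deltas (assumed < 2^11);
+   since 11 and 32 are coprime the layout only realigns after 32 inputs,
+   each boundary-straddling value carrying its high bits into the next of
+   the 11 output words. */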
+void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
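+/* ipack11: masked 11-bit variant; each delta is ANDed with 0x7FF
+ * (2047) before packing, so oversized deltas are truncated instead of
+ * corrupting neighbouring fields. */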
+void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(2047U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 11 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
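+/* ipackwithoutmask12: unmasked 12-bit variant; packs 128 deltas into
+ * 12 output vectors. The field layout realigns to a word boundary
+ * every 3 output words (8 values per lane), so the same sub-pattern
+ * repeats four times below. */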
+void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
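+/* ipack12: masked 12-bit variant (mask 0xFFF / 4095); oversized
+ * deltas are truncated before packing. */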
+void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(4095U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 12 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
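+/* ipackwithoutmask13: unmasked 13-bit variant; packs 128 deltas into
+ * 13 output vectors, with most fields split across word boundaries. */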
+void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
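+/* ipack13: masked 13-bit variant (mask 0x1FFF / 8191); oversized
+ * deltas are truncated before packing. */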
+void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(8191U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 13 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
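+/* ipackwithoutmask14: unmasked 14-bit variant; packs 128 deltas into
+ * 14 output vectors, the layout repeating after 7 words (16 values
+ * per lane). */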
+void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
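+/* ipack14: masked 14-bit variant (mask 0x3FFF / 16383); oversized
+ * deltas are truncated before packing. */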
+void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(16383U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 14 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
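+/* ipackwithoutmask15: unmasked 15-bit variant; packs 128 deltas into
+ * 15 output vectors. */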
+void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
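+/* ipack15: delta-encode and bit-pack one block of 128 32-bit integers
+   (32 unaligned __m128i loads) at 15 bits per value, masking each delta
+   to its low 15 bits, then write 15 128-bit words to "out". initOffset
+   carries the running delta base (normally the previous block's last vector). */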
+void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(32767U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 15 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
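+/* ipackwithoutmask16: 16-bit variant of the packer below that skips the
+   mask step; correct only when every delta already fits in 16 bits. */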
+void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
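+/* ipack16: delta-encode and pack 128 integers at 16 bits per value
+   (two values per 32-bit lane), writing 16 128-bit words to "out". */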
+void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(65535U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
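+/* Usage sketch (illustrative only, assuming Delta() is the 4-lane SIMD
+   difference helper defined earlier in this file):
+
+       uint32_t in[128];                      // one sorted input block
+       __m128i  out[16];                      // 16 bits * 128 values = 16 vectors
+       __m128i  start = _mm_setzero_si128();  // delta base; zero for the first block
+       ipack16(start, in, out);
+*/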
+
+
+
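+/* ipackwithoutmask17: 17-bit packer without the delta mask; correct only
+   when every delta fits in 17 bits. */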
+void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
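+/* ipack17: delta-encode and pack 128 integers at 17 bits per value,
+   masking deltas to 17 bits; writes 17 128-bit words to "out". */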
+void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(131071U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 17 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
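+/* ipackwithoutmask18: 18-bit packer without the delta mask; correct only
+   when every delta fits in 18 bits. */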
+void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
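+/* ipack18: delta-encode and pack 128 integers at 18 bits per value,
+   masking deltas to 18 bits; writes 18 128-bit words to "out". */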
+void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(262143U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 18 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
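+/* ipackwithoutmask19: 19-bit packer without the delta mask; correct only
+   when every delta fits in 19 bits. */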
+void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
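+/* ipack19: delta-encode and pack 128 integers at 19 bits per value,
+   masking deltas to 19 bits; writes 19 128-bit words to "out". */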
+void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(524287U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 19 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
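+/* ipackwithoutmask20: like ipack20 below, but assumes every delta already
+   fits in 20 bits, so the per-element mask is skipped. Packs 128 integers
+   into 20 __m128i words. */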
+void ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
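+/* ipack20: delta-encodes and packs 128 32-bit integers at 20 bits each,
+   masking every delta to its low 20 bits. Sketch of intended use (names
+   hypothetical): ipack20(prev, &in[0], (__m128i *)out); where prev holds
+   the last four values of the preceding block (or zeros for the first). */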
+void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(1048575U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 20 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
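+/* ipackwithoutmask21: unmasked variant for 21-bit packing; correct only
+   when all deltas are below 2^21. Emits 21 __m128i words per 128 inputs. */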
+void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
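+/* ipack21: masked 21-bit variant; deltas are truncated to 21 bits with
+   mask 0x1FFFFF (2097151). */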
+void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(2097151U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 21 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
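+/* ipackwithoutmask22: unmasked 22-bit packing. Since 22*16 = 11*32, the
+   bit position realigns to zero after every 16 input vectors, so the same
+   store/carry sequence repeats twice over the 32 inputs, producing 22
+   output words in all. */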
+void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
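+/* ipack22: masked 22-bit variant (mask 0x3FFFFF = 4194303); otherwise
+   identical in structure to ipackwithoutmask22 above. */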
+void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(4194303U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 22 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
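+/* ipackwithoutmask23: unmasked 23-bit packing; 23 is coprime to 32, so the
+   carry shifts never realign mid-block and the 32 inputs fill exactly 23
+   output words. */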
+void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
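+/* ipack23: masked 23-bit variant (mask 0x7FFFFF = 8388607). */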
+void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ const __m128i mask = _mm_set1_epi32(8388607U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 23 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
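+/* Hypothetical usage sketch (names here are illustrative, not part of the
+   library API): pack one block of 128 sorted 32-bit values at 23 bits per
+   delta.  "base" is the value preceding the block, broadcast into the
+   initial offset vector; ipack23 truncates each delta to 23 bits, so use
+   the withoutmask variant only when the 23-bit bound is guaranteed.
+   Output occupies 23 * 16 = 368 bytes. */
+static void example_ipack23_block(const uint32_t in[128], uint32_t base,
+                                  __m128i out[23]) {
+  ipack23(_mm_set1_epi32(base), in, out);
+}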
+
+
+
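+/* Delta-encode 32 SSE registers (128 x 32-bit integers) relative to the
+   running offset and pack each delta into 24 bits without masking; the
+   caller must guarantee every delta fits in 24 bits.  Writes 24 words. */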
+void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
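+/* Masked 24-bit variant: as ipackwithoutmask24, but each delta is ANDed
+   with the 24-bit mask before packing, so oversized deltas are truncated
+   rather than corrupting neighbouring fields. */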
+void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(16777215U); /* (1U << 24) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 24 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
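+/* Delta-encode 128 x 32-bit integers relative to the running offset and
+   pack each delta into 25 bits without masking; every delta must fit in
+   25 bits.  Writes 25 x 128-bit words. */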
+void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
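+/* Masked 25-bit variant: each delta is ANDed with the 25-bit mask before
+   packing, so oversized deltas are truncated safely. */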
+void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(33554431U); /* (1U << 25) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 25 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
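+/* Delta-encode 128 x 32-bit integers relative to the running offset and
+   pack each delta into 26 bits without masking; every delta must fit in
+   26 bits.  Writes 26 x 128-bit words. */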
+void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
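+/* Masked 26-bit variant: each delta is ANDed with the 26-bit mask before
+   packing, so oversized deltas are truncated safely. */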
+void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(67108863U); /* (1U << 26) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 26 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
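+/* Delta-encode 128 x 32-bit integers relative to the running offset and
+   pack each delta into 27 bits without masking; every delta must fit in
+   27 bits.  Writes 27 x 128-bit words. */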
+void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
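+// ipack27: delta-encode one block of 128 uint32 values (32 SSE vectors)
+// against the running offset via the Delta() helper, mask each delta to
+// its low 27 bits, and bit-pack the block into 27 consecutive 128-bit
+// words at "out".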
+void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(134217727U); /* (1U << 27) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 27 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
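+
+// Illustrative usage (a sketch, not part of the generated code; buffer
+// names are hypothetical): packing one 128-integer block at 27 bits per
+// value with the routine above.
+//
+//   uint32_t ids[128];                  /* one sorted input block      */
+//   __m128i  packed[27];                /* 27 x 16 = 432 output bytes  */
+//   __m128i  seed = _mm_set1_epi32(0);  /* offset for the first delta  */
+//   ipack27(seed, ids, packed);         /* 512 bytes in, 432 bytes out */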
+
+
+
+
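+// ipackwithoutmask28: 28-bit variant without masking - the caller must
+// guarantee every delta already fits in 28 bits. The layout realigns
+// every 8 values (8 x 28 = 224 bits = 7 words), hence the periodic
+// "OutReg = InReg" resets below.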
+void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
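+// ipack28: masked 28-bit packer; each delta is ANDed with
+// (1U << 28) - 1 before being packed into 28 x 128-bit output words.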
+void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(268435455U); /* (1U << 28) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 28 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
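+// ipackwithoutmask29: 29-bit packer without masking; deltas must fit in
+// 29 bits. 29 and 32 are coprime, so packed values straddle word
+// boundaries and the 128-value block forms a single 29-word pattern.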
+void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 27);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
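+// ipack29: masked 29-bit packer; each delta is ANDed with
+// (1U << 29) - 1 before being packed into 29 x 128-bit output words.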
+void ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(536870911U); /* (1U << 29) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 27);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 29 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
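+// ipackwithoutmask30: 30-bit packer without masking; deltas must fit in
+// 30 bits. The layout realigns every 16 values (16 x 30 = 480 bits =
+// 15 words), so the pattern below repeats twice per 128-value block.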
+void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
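+// ipack30: masked 30-bit packer; each delta is ANDed with
+// (1U << 30) - 1 before being packed into 30 x 128-bit output words.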
+void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(1073741823U); /* (1U << 30) - 1 */
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 30 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
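+/* Pack 128 32-bit integers at 31 bits each: each vector is delta-coded
+   against the previous one (starting from initOffset, via the Delta()
+   helper used throughout this file) and the differences are bit-packed
+   without masking, so the caller must guarantee every delta fits in
+   31 bits. */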
+void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 30);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 29);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 27);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = Delta(CurrIn, initOffset);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
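+/* Masked variant of ipackwithoutmask31: each delta is ANDed with a 31-bit
+   mask before packing, so out-of-range inputs cannot corrupt neighboring
+   values. */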
+void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ __m128i OutReg;
+
+
+    const __m128i mask = _mm_set1_epi32(2147483647U);
+
+ __m128i CurrIn = _mm_loadu_si128(in);
+ __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+ OutReg = InReg;
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 30);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 29);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 28);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 27);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 26);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 25);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 24);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 23);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 22);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 21);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 20);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 19);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 18);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 17);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 16);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 15);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 14);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 13);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 12);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 11);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 10);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 9);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 8);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 7);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 6);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 5);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 4);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 3);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 2);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2));
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ OutReg = _mm_srli_epi32(InReg, 31 - 1);
+ ++in;
+ CurrIn = _mm_loadu_si128(in);
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask);
+ initOffset = CurrIn;
+
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1));
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
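+/* 32-bit case: packing cannot save anything, so the 32 input vectors
+   (128 integers) are copied through unchanged; initOffset is unused. */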
+void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ (void) initOffset;
+ __m128i OutReg;
+
+
+ __m128i InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
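+/* Identical to ipackwithoutmask32: at 32 bits per value no mask is needed,
+   so this too degenerates to a plain copy of the 32 input vectors. */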
+void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) {
+ const __m128i *in = (const __m128i*)(_in);
+ (void) initOffset;
+ __m128i OutReg;
+
+
+
+ __m128i InReg = _mm_loadu_si128(in);
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+ ++out;
+ ++in;
+ InReg = _mm_loadu_si128(in);
+
+ OutReg = InReg;
+ _mm_storeu_si128(out, OutReg);
+
+
+}
+
+
+
+
+
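+/* Unpack 128 integers stored at 1 bit each (a single 128-bit word),
+   reversing the delta coding with the PrefixSum() helper used throughout
+   this file, and return the running offset (last decoded vector) so
+   decoding can continue with the next block. */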
+__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<1)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
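+/* As iunpack1, but 2 bits per value read from two 128-bit words. */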
+__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<2)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
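+/* 3 bits per value from three 128-bit words; values straddling a word
+   boundary are stitched together from two consecutive loads. */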
+__m128i iunpack3(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<3)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
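+/* 4 bits per value from four 128-bit words; since 4 divides 32, no value
+   straddles a word boundary. */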
+__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<4)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
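+/* 5 bits per value from five 128-bit words, with boundary stitching as in
+   iunpack3. */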
+__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<5)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
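+/* 6 bits per value from six 128-bit words. */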
+__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<6)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
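+/* Same scheme for 7-bit values: 7 input words -> 128 prefix-summed outputs. */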
+__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<7)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
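+/* 8-bit case: fields are byte-aligned, so no value straddles a word boundary
+   and no cross-word OR is needed (8 input words -> 128 outputs). */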
+__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<8)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
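+/* Same scheme for 9-bit values: 9 input words -> 128 prefix-summed outputs. */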
+__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<9)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
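+/* Same scheme for 10-bit values (10 input words). */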
+__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<10)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
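+/* Same scheme for 11-bit values (11 input words). */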
+__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<11)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
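+/* Same scheme for 12-bit values (12 input words); fields realign with the
+   word boundary every 8 values, so some boundaries need no cross-word OR. */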
+__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<12)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
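+/* Same scheme for 13-bit values (13 input words). */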
+__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<13)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
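+/* Same scheme for 14-bit values (14 input words). */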
+__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<14)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
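+/* Same scheme for 15-bit values (15 input words). */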
+__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<15)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
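+/* iunpack16: 16-bit fields pack exactly two per 32-bit word, so no field
+   straddles a word boundary and no cross-word stitching is required. */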
+__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<16)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
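+/* iunpack17: same shift/mask/PrefixSum scheme with 17-bit fields; roughly
+   every other field crosses a 32-bit word boundary and is reassembled from
+   two adjacent input words. */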
+__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<17)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
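+/* iunpack18: 18-bit fields; the bit layout repeats after 16 outputs, so the
+   second half of the body mirrors the first. */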
+__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<18)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
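+/* iunpack19: 19-bit fields; with a width coprime to 32 the layout only
+   repeats after a full 32 outputs, so every word-boundary crossing is
+   spelled out individually. */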
+__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<19)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
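+/* iunpack20: 20-bit fields; the bit layout repeats every 8 outputs, giving
+   four identical groups per 128-integer block. */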
+__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<20)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
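+/* iunpack21: 21-bit fields, same shift/mask/PrefixSum pattern unrolled over
+   a full 32-output period. */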
+__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<21)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
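+/* iunpack22: 22-bit fields; the layout repeats after 16 outputs, so the body
+   is two identical halves. */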
+__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<22)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
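+  /* The final output vector doubles as the carry (running offset) for the next block. */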
+ return initOffset;
+
+}
+
+
+
+
+
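+/* Unpack 128 values stored as 23-bit deltas (23 SIMD words read from "in"),
+   restore them with a running prefix sum seeded by initOffset, write the
+   128 decoded 32-bit integers to _out, and return the last output vector
+   as the starting offset for the next block. */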
+__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<23)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
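+/* Same scheme for 24-bit deltas: 24 input words -> 128 prefix-summed integers. */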
+__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<24)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
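+/* 25-bit variant: 25 input words -> 128 prefix-summed integers. */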
+__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<25)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
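+/* 26-bit variant: 26 input words -> 128 prefix-summed integers. */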
+__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<26)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
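+/* 27-bit variant: 27 input words -> 128 prefix-summed integers. */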
+__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<27)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
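+/* 28-bit variant: 28 input words -> 128 prefix-summed integers.
+   28 divides 32*4 evenly, so the unrolled body repeats the same
+   8-value pattern four times with no cross-word remainder shifts. */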
+__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<28)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
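+/* 29-bit variant: 29 input words -> 128 prefix-summed integers. */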
+__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<29)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
+__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<30)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
+
+__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+
+ __m128i* out = (__m128i*)(_out);
+ __m128i InReg = _mm_loadu_si128(in);
+ __m128i OutReg;
+ __m128i tmp;
+ __m128i mask = _mm_set1_epi32((1U<<31)-1);
+
+
+
+ tmp = InReg;
+ OutReg = _mm_and_si128(tmp, mask);
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,31);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,30);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,29);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,28);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,27);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,26);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,25);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,24);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,23);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,22);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,21);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,20);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,19);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,18);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,17);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,16);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,15);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,14);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,13);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,12);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,11);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,10);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,9);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,8);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,7);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,6);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,5);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,4);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,3);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,2);
+ OutReg = tmp;
+ ++in; InReg = _mm_loadu_si128(in);
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask));
+
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+ tmp = _mm_srli_epi32(InReg,1);
+ OutReg = tmp;
+ OutReg = PrefixSum(OutReg, initOffset);
+ initOffset = OutReg;
+ _mm_storeu_si128(out++, OutReg);
+
+
+ return initOffset;
+
+}
+
+
+
+
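+/* bit width 32: values are stored verbatim, so decoding is a plain copy of
+   128 32-bit integers; no bit unpacking and no prefix-sum step is needed. */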
+__m128i iunpack32(__m128i initOffset, const __m128i* in, uint32_t * _out) {
+ (void) initOffset;
+ __m128i * mout = (__m128i *)(_out);
+ __m128i invec;
+ size_t k;
+ for(k = 0; k < 128/4; ++k) {
+ invec = _mm_loadu_si128(in++);
+ _mm_storeu_si128(mout++, invec);
+ }
+ return invec;
+}
+
+
+
+
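+/* Decode one block of 128 differentially-coded 32-bit integers bit-packed at
+   "in" with width "bit". "initvalue" is the running offset, i.e. the last
+   decoded value of the previous block (0 for the first block). */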
+ void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) {
+ __m128i initOffset = _mm_set1_epi32 (initvalue);
+ switch(bit) {
+ case 0: iunpack0(initOffset,in,out); break;
+
+ case 1: iunpack1(initOffset,in,out); break;
+
+ case 2: iunpack2(initOffset,in,out); break;
+
+ case 3: iunpack3(initOffset,in,out); break;
+
+ case 4: iunpack4(initOffset,in,out); break;
+
+ case 5: iunpack5(initOffset,in,out); break;
+
+ case 6: iunpack6(initOffset,in,out); break;
+
+ case 7: iunpack7(initOffset,in,out); break;
+
+ case 8: iunpack8(initOffset,in,out); break;
+
+ case 9: iunpack9(initOffset,in,out); break;
+
+ case 10: iunpack10(initOffset,in,out); break;
+
+ case 11: iunpack11(initOffset,in,out); break;
+
+ case 12: iunpack12(initOffset,in,out); break;
+
+ case 13: iunpack13(initOffset,in,out); break;
+
+ case 14: iunpack14(initOffset,in,out); break;
+
+ case 15: iunpack15(initOffset,in,out); break;
+
+ case 16: iunpack16(initOffset,in,out); break;
+
+ case 17: iunpack17(initOffset,in,out); break;
+
+ case 18: iunpack18(initOffset,in,out); break;
+
+ case 19: iunpack19(initOffset,in,out); break;
+
+ case 20: iunpack20(initOffset,in,out); break;
+
+ case 21: iunpack21(initOffset,in,out); break;
+
+ case 22: iunpack22(initOffset,in,out); break;
+
+ case 23: iunpack23(initOffset,in,out); break;
+
+ case 24: iunpack24(initOffset,in,out); break;
+
+ case 25: iunpack25(initOffset,in,out); break;
+
+ case 26: iunpack26(initOffset,in,out); break;
+
+ case 27: iunpack27(initOffset,in,out); break;
+
+ case 28: iunpack28(initOffset,in,out); break;
+
+ case 29: iunpack29(initOffset,in,out); break;
+
+ case 30: iunpack30(initOffset,in,out); break;
+
+ case 31: iunpack31(initOffset,in,out); break;
+
+ case 32: iunpack32(initOffset,in,out); break;
+
+ default: break;
+ }
+}
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) {
+ __m128i initOffset = _mm_set1_epi32 (initvalue);
+ switch(bit) {
+ case 0: break;
+
+ case 1: ipackwithoutmask1(initOffset,in,out); break;
+
+ case 2: ipackwithoutmask2(initOffset,in,out); break;
+
+ case 3: ipackwithoutmask3(initOffset,in,out); break;
+
+ case 4: ipackwithoutmask4(initOffset,in,out); break;
+
+ case 5: ipackwithoutmask5(initOffset,in,out); break;
+
+ case 6: ipackwithoutmask6(initOffset,in,out); break;
+
+ case 7: ipackwithoutmask7(initOffset,in,out); break;
+
+ case 8: ipackwithoutmask8(initOffset,in,out); break;
+
+ case 9: ipackwithoutmask9(initOffset,in,out); break;
+
+ case 10: ipackwithoutmask10(initOffset,in,out); break;
+
+ case 11: ipackwithoutmask11(initOffset,in,out); break;
+
+ case 12: ipackwithoutmask12(initOffset,in,out); break;
+
+ case 13: ipackwithoutmask13(initOffset,in,out); break;
+
+ case 14: ipackwithoutmask14(initOffset,in,out); break;
+
+ case 15: ipackwithoutmask15(initOffset,in,out); break;
+
+ case 16: ipackwithoutmask16(initOffset,in,out); break;
+
+ case 17: ipackwithoutmask17(initOffset,in,out); break;
+
+ case 18: ipackwithoutmask18(initOffset,in,out); break;
+
+ case 19: ipackwithoutmask19(initOffset,in,out); break;
+
+ case 20: ipackwithoutmask20(initOffset,in,out); break;
+
+ case 21: ipackwithoutmask21(initOffset,in,out); break;
+
+ case 22: ipackwithoutmask22(initOffset,in,out); break;
+
+ case 23: ipackwithoutmask23(initOffset,in,out); break;
+
+ case 24: ipackwithoutmask24(initOffset,in,out); break;
+
+ case 25: ipackwithoutmask25(initOffset,in,out); break;
+
+ case 26: ipackwithoutmask26(initOffset,in,out); break;
+
+ case 27: ipackwithoutmask27(initOffset,in,out); break;
+
+ case 28: ipackwithoutmask28(initOffset,in,out); break;
+
+ case 29: ipackwithoutmask29(initOffset,in,out); break;
+
+ case 30: ipackwithoutmask30(initOffset,in,out); break;
+
+ case 31: ipackwithoutmask31(initOffset,in,out); break;
+
+ case 32: ipackwithoutmask32(initOffset,in,out); break;
+
+ default: break;
+ }
+}
+
+
+
+
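+/* Same dispatch as simdpackwithoutmaskd1, except the ipackN routines mask
+   each delta to "bit" bits first, so inputs need not already fit in the
+   prescribed width (the "withoutmask" variant assumes they do). */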
+void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) {
+ __m128i initOffset = _mm_set1_epi32 (initvalue);
+ switch(bit) {
+ case 0: break;
+
+ case 1: ipack1(initOffset, in,out); break;
+
+ case 2: ipack2(initOffset, in,out); break;
+
+ case 3: ipack3(initOffset, in,out); break;
+
+ case 4: ipack4(initOffset, in,out); break;
+
+ case 5: ipack5(initOffset, in,out); break;
+
+ case 6: ipack6(initOffset, in,out); break;
+
+ case 7: ipack7(initOffset, in,out); break;
+
+ case 8: ipack8(initOffset, in,out); break;
+
+ case 9: ipack9(initOffset, in,out); break;
+
+ case 10: ipack10(initOffset, in,out); break;
+
+ case 11: ipack11(initOffset, in,out); break;
+
+ case 12: ipack12(initOffset, in,out); break;
+
+ case 13: ipack13(initOffset, in,out); break;
+
+ case 14: ipack14(initOffset, in,out); break;
+
+ case 15: ipack15(initOffset, in,out); break;
+
+ case 16: ipack16(initOffset, in,out); break;
+
+ case 17: ipack17(initOffset, in,out); break;
+
+ case 18: ipack18(initOffset, in,out); break;
+
+ case 19: ipack19(initOffset, in,out); break;
+
+ case 20: ipack20(initOffset, in,out); break;
+
+ case 21: ipack21(initOffset, in,out); break;
+
+ case 22: ipack22(initOffset, in,out); break;
+
+ case 23: ipack23(initOffset, in,out); break;
+
+ case 24: ipack24(initOffset, in,out); break;
+
+ case 25: ipack25(initOffset, in,out); break;
+
+ case 26: ipack26(initOffset, in,out); break;
+
+ case 27: ipack27(initOffset, in,out); break;
+
+ case 28: ipack28(initOffset, in,out); break;
+
+ case 29: ipack29(initOffset, in,out); break;
+
+ case 30: ipack30(initOffset, in,out); break;
+
+ case 31: ipack31(initOffset, in,out); break;
+
+ case 32: ipack32(initOffset, in,out); break;
+
+ default: break;
+ }
+}
+
diff --git a/ext/simdcomp/src/unit.c b/ext/simdcomp/src/unit.c
new file mode 100644
index 0000000..826f447
--- /dev/null
+++ b/ext/simdcomp/src/unit.c
@@ -0,0 +1,63 @@
+/**
+ * This code is released under a BSD License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include "simdcomp.h"
+
+
+int main() {
+ int N = 5000 * SIMDBlockSize;
+ __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+ uint32_t * datain = malloc(N * sizeof(uint32_t));
+ uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+ for (int gap = 1; gap <= 387420489; gap *= 3) {
+ printf(" gap = %u \n", gap);
+ for (int k = 0; k < N; ++k)
+ datain[k] = k * gap;
+ uint32_t offset = 0;
+ for (int k = 0; k * SIMDBlockSize < N; ++k) {
+ /////////////////////////////
+ // First part works for general arrays (sorted or unsorted)
+ /////////////////////////////
+ // we compute the bit width
+ const uint32_t b = maxbits(datain + k * SIMDBlockSize);
+ // we read 128 integers at "datain + k * SIMDBlockSize" and
+ // write b 128-bit vectors at "buffer"
+ simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b);
+ // we read back b 128-bit vectors at "buffer" and write 128 integers at backbuffer
+ simdunpack(buffer, backbuffer, b);//uncompressed
+ for (int j = 0; j < SIMDBlockSize; ++j) {
+ if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
+ printf("bug in simdpack\n");
+ return -2;
+ }
+ }
+ /////////////////////////////
+ // next part assumes that the data is sorted (uses differential coding)
+ /////////////////////////////
+ // we compute the bit width
+ const uint32_t b1 = simdmaxbitsd1(offset,
+ datain + k * SIMDBlockSize);
+ // we read 128 integers at "datain + k * SIMDBlockSize" and
+ // write b1 128-bit vectors at "buffer"
+ simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer,
+ b1);
+ // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer
+ simdunpackd1(offset, buffer, backbuffer, b1);
+ for (int j = 0; j < SIMDBlockSize; ++j) {
+ if (backbuffer[j] != datain[k * SIMDBlockSize + j]) {
+ printf("bug in simdpack d1\n");
+ return -3;
+ }
+ }
+ offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
+
+ }
+ }
+ free(buffer);
+ free(datain);
+ free(backbuffer);
+ printf("Code looks good.\n");
+ return 0;
+}
diff --git a/ext/simple8b.c b/ext/simple8b.c
new file mode 100644
index 0000000..3ac5615
--- /dev/null
+++ b/ext/simple8b.c
@@ -0,0 +1,330 @@
+// modified and optimized (speed + compression) by powturbo
+// 64 bits version from: Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words.
+// Softw., Pract. Exper. 40(2): 131-147 (2010)
+// http://ww2.cs.mu.oz.au/~alistair/coders-64bit/
+
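+// bsr32(x) returns the bit length of x, i.e. the index of the highest set bit plus one (0 for x == 0)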
+ #if defined(__x86_64__) || defined(__x86_32__)
+static inline int bsr32(int x) {
+ int b = -1;
+ asm("bsrl %1,%0" : "+r" (b): "rm" (x) );
+ return b + 1;
+}
+ #else
+static inline int bsr32(int x) {
+ return x?32 - __builtin_clz(x):0;
+}
+ #endif
+
+#define WPUT(__x,__bit) { __bw |= (unsigned long long)(__x)<<__br; __br += __bit; }
+#define WPUTZERO(__sel) { __bw = __br = 0; WPUT(__sel,4); }
+#define WPUTFLUSH(__out) { *(typeof(__bw) *)__out = __bw; __out += sizeof(__bw)/sizeof(__out[0]); }
+
+#if 0 //WORD_SIZE==32
+ #define CODE_TABLE \
+ unsigned char sel2bit[]= { 0, 0, 0, 0, 0, 0, 0, 1 ,2,3,4,5,7,9,14,28}; \
+ unsigned sel2elems[]= {256,120,90,60,50,40,32,28,14,9,7,5,4,3, 2, 1}; \
+
+ #define BIT_2_SEL \
+ char bit2sel[]= { 0,7,8,9,10,11,12,12,13,13,14,14,14,14,14, \
+ 15,15,15,15,15,15,15,15,15,15,15,15,15,15, \
+ -1,-1,-1,-1};
+ #define MAX_BIT 28
+#else
+#define CODE_TABLE \
+ /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ \
+unsigned char sel2bit[]= { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60,61}; \
+unsigned sel2elems[]= {256,120,60,30,20,15,12,10, 8, 7, 6, 5, 4, 3, 2, 1}; \
+unsigned sellim[]= {256,120,60,60,60,60,60,60,56,56, 60, 60, 60, 60, 60, 60};
+
+#define BIT_2_SEL char bit2sel[]= \
+ {0,2,3,4,5,6,7,8, 9,10,10,11,11,12,12,12, \
+ 13,13,13,13,13,14,14,14, 14,14,14,14,14,14,14,15, \
+ 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, \
+ 15,15,15,15,15,15,15,15, 15,15,15,15,15,-1, -1, -1, -1};
+
+ #define MAX_BIT 60
+#endif
+
+CODE_TABLE
+BIT_2_SEL
+
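+// Each 64-bit word starts with a 4-bit selector that picks a row above:
+// e.g. selector 5 stores 15 values of 4 bits each (4 + 15*4 = 64 bits),
+// selector 15 stores a single 60-bit value. sel2elems gives the number of
+// values per word, sel2bit the per-value bit width.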
+unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out) {
+ unsigned long long __bw; unsigned __br = 0;
+ unsigned char bits[0x1000];
+ int elems;
+ int i,j;
+ for (i = 0; i < n; i++)
+ bits[i] = bsr32(in[i])+1; //CalcMinBits(in, bits, n);
+ int sel, bit,tmp; /*BLK_ENC_ADJUST*/
+ for (i = 0; i < n;) {
+   for (sel = bit = elems = 0, j = i; j < n && elems < sel2elems[sel]; j++) {
+     if (bits[j] > bit) {
+ tmp = bit2sel[bits[j]] ;
+ if(elems < sel2elems[ tmp ]) {
+ sel = tmp;
+ bit= sel2bit[sel];
+ } else {
+ while ( elems < sel2elems[sel] ) sel++;
+ elems = sel2elems[sel];
+ bit = sel2bit[sel];
+ break;
+ }
+ }
+ elems++;
+ }
+ if (bit == 0) { /* can be downgraded to bit=1 */
+   if (i+elems < n) {
+     for (sel = 2; sel2elems[sel] > elems; sel++);
+ elems = sel2elems[sel];
+ bit = sel2bit[sel];
+ } else sel = 0; /* what a waste! */
+ } else {
+ sel = bit2sel[bit];
+ bit = sel2bit[sel];
+ }
+ WPUTZERO(sel);
+ if (bit) {
+ for ( ; elems ; elems--, i++) WPUT(in[i],bit);
+ } else
+ i += elems;
+ WPUTFLUSH(out);
+ }
+ return out;
+}
+
+#define MSK(__x) ((1ul<<__x)-1)
+unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out) {
+ unsigned char *ip = in;
+ unsigned i,*_out = out,*out_ = out+n;
+ while(out < out_) {
+ unsigned long long w = *(unsigned long long *)ip;
+ switch(w & 15) {
+ #if 1
+ case 0: ip+=8; for(i=0; i<256; i++) out[i]= 1; out += 256; break;
+ #else
+ case 0: { int r = (w>>4)&0xf; ip++; if(r == 0xf) { r = (w>>8)&0xff; ip++; } while(r-->=0) *out++=0; } break;
+ #endif
+
+ case 1: ip+=8;
+ for(i=0; i<120; i++) out[i]= 1; out += 120;
+ break;
+ case 2: ip+=8;
+ out[ 0]= (w >> 4) & MSK(1);
+ out[ 1]= (w >> 5) & MSK(1);
+ out[ 2]= (w >> 6) & MSK(1);
+ out[ 3]= (w >> 7) & MSK(1);
+ out[ 4]= (w >> 8) & MSK(1);
+ out[ 5]= (w >> 9) & MSK(1);
+ out[ 6]= (w >> 10) & MSK(1);
+ out[ 7]= (w >> 11) & MSK(1);
+ out[ 8]= (w >> 12) & MSK(1);
+ out[ 9]= (w >> 13) & MSK(1);
+ out[10]= (w >> 14) & MSK(1);
+ out[11]= (w >> 15) & MSK(1);
+ out[12]= (w >> 16) & MSK(1);
+ out[13]= (w >> 17) & MSK(1);
+ out[14]= (w >> 18) & MSK(1);
+ out[15]= (w >> 19) & MSK(1);
+ out[16]= (w >> 20) & MSK(1);
+ out[17]= (w >> 21) & MSK(1);
+ out[18]= (w >> 22) & MSK(1);
+ out[19]= (w >> 23) & MSK(1);
+ out[20]= (w >> 24) & MSK(1);
+ out[21]= (w >> 25) & MSK(1);
+ out[22]= (w >> 26) & MSK(1);
+ out[23]= (w >> 27) & MSK(1);
+ out[24]= (w >> 28) & MSK(1);
+ out[25]= (w >> 29) & MSK(1);
+ out[26]= (w >> 30) & MSK(1);
+ out[27]= (w >> 31) & MSK(1);
+ out[28]= (w >> 32) & MSK(1);
+ out[29]= (w >> 33) & MSK(1);
+ out[30]= (w >> 34) & MSK(1);
+ out[31]= (w >> 35) & MSK(1);
+ out[32]= (w >> 36) & MSK(1);
+ out[33]= (w >> 37) & MSK(1);
+ out[34]= (w >> 38) & MSK(1);
+ out[35]= (w >> 39) & MSK(1);
+ out[36]= (w >> 40) & MSK(1);
+ out[37]= (w >> 41) & MSK(1);
+ out[38]= (w >> 42) & MSK(1);
+ out[39]= (w >> 43) & MSK(1);
+ out[40]= (w >> 44) & MSK(1);
+ out[41]= (w >> 45) & MSK(1);
+ out[42]= (w >> 46) & MSK(1);
+ out[43]= (w >> 47) & MSK(1);
+ out[44]= (w >> 48) & MSK(1);
+ out[45]= (w >> 49) & MSK(1);
+ out[46]= (w >> 50) & MSK(1);
+ out[47]= (w >> 51) & MSK(1);
+ out[48]= (w >> 52) & MSK(1);
+ out[49]= (w >> 53) & MSK(1);
+ out[50]= (w >> 54) & MSK(1);
+ out[51]= (w >> 55) & MSK(1);
+ out[52]= (w >> 56) & MSK(1);
+ out[53]= (w >> 57) & MSK(1);
+ out[54]= (w >> 58) & MSK(1);
+ out[55]= (w >> 59) & MSK(1);
+ out[56]= (w >> 60) & MSK(1);
+ out[57]= (w >> 61) & MSK(1);
+ out[58]= (w >> 62) & MSK(1);
+ out[59]= (w >> 63) & MSK(1); out += 60;
+ break;
+ case 3: ip+=8;
+ out[ 0]= (w >> 4) & MSK(2);
+ out[ 1]= (w >> 6) & MSK(2);
+ out[ 2]= (w >> 8) & MSK(2);
+ out[ 3]= (w >> 10) & MSK(2);
+ out[ 4]= (w >> 12) & MSK(2);
+ out[ 5]= (w >> 14) & MSK(2);
+ out[ 6]= (w >> 16) & MSK(2);
+ out[ 7]= (w >> 18) & MSK(2);
+ out[ 8]= (w >> 20) & MSK(2);
+ out[ 9]= (w >> 22) & MSK(2);
+ out[10]= (w >> 24) & MSK(2);
+ out[11]= (w >> 26) & MSK(2);
+ out[12]= (w >> 28) & MSK(2);
+ out[13]= (w >> 30) & MSK(2);
+ out[14]= (w >> 32) & MSK(2);
+ out[15]= (w >> 34) & MSK(2);
+ out[16]= (w >> 36) & MSK(2);
+ out[17]= (w >> 38) & MSK(2);
+ out[18]= (w >> 40) & MSK(2);
+ out[19]= (w >> 42) & MSK(2);
+ out[20]= (w >> 44) & MSK(2);
+ out[21]= (w >> 46) & MSK(2);
+ out[22]= (w >> 48) & MSK(2);
+ out[23]= (w >> 50) & MSK(2);
+ out[24]= (w >> 52) & MSK(2);
+ out[25]= (w >> 54) & MSK(2);
+ out[26]= (w >> 56) & MSK(2);
+ out[27]= (w >> 58) & MSK(2);
+ out[28]= (w >> 60) & MSK(2);
+ out[29]= (w >> 62) & MSK(2); out += 30;
+ break;
+ case 4: ip+=8;
+ out[ 0]= (w >> 4) & MSK(3);
+ out[ 1]= (w >> 7) & MSK(3);
+ out[ 2]= (w >> 10) & MSK(3);
+ out[ 3]= (w >> 13) & MSK(3);
+ out[ 4]= (w >> 16) & MSK(3);
+ out[ 5]= (w >> 19) & MSK(3);
+ out[ 6]= (w >> 22) & MSK(3);
+ out[ 7]= (w >> 25) & MSK(3);
+ out[ 8]= (w >> 28) & MSK(3);
+ out[ 9]= (w >> 31) & MSK(3);
+ out[10]= (w >> 34) & MSK(3);
+ out[11]= (w >> 37) & MSK(3);
+ out[12]= (w >> 40) & MSK(3);
+ out[13]= (w >> 43) & MSK(3);
+ out[14]= (w >> 46) & MSK(3);
+ out[15]= (w >> 49) & MSK(3);
+ out[16]= (w >> 52) & MSK(3);
+ out[17]= (w >> 55) & MSK(3);
+ out[18]= (w >> 58) & MSK(3);
+ out[19]= (w >> 61) & MSK(3); out += 20;
+ break;
+ case 5: ip+=8;
+ out[ 0]= (w >> 4) & MSK(4);
+ out[ 1]= (w >> 8) & MSK(4);
+ out[ 2]= (w >> 12) & MSK(4);
+ out[ 3]= (w >> 16) & MSK(4);
+ out[ 4]= (w >> 20) & MSK(4);
+ out[ 5]= (w >> 24) & MSK(4);
+ out[ 6]= (w >> 28) & MSK(4);
+ out[ 7]= (w >> 32) & MSK(4);
+ out[ 8]= (w >> 36) & MSK(4);
+ out[ 9]= (w >> 40) & MSK(4);
+ out[10]= (w >> 44) & MSK(4);
+ out[11]= (w >> 48) & MSK(4);
+ out[12]= (w >> 52) & MSK(4);
+ out[13]= (w >> 56) & MSK(4);
+ out[14]= (w >> 60) & MSK(4); out += 15;
+ break;
+ case 6: ip+=8;
+ out[ 0]= (w >> 4) & MSK(5);
+ out[ 1]= (w >> 9) & MSK(5);
+ out[ 2]= (w >> 14) & MSK(5);
+ out[ 3]= (w >> 19) & MSK(5);
+ out[ 4]= (w >> 24) & MSK(5);
+ out[ 5]= (w >> 29) & MSK(5);
+ out[ 6]= (w >> 34) & MSK(5);
+ out[ 7]= (w >> 39) & MSK(5);
+ out[ 8]= (w >> 44) & MSK(5);
+ out[ 9]= (w >> 49) & MSK(5);
+ out[10]= (w >> 54) & MSK(5);
+ out[11]= (w >> 59) & MSK(5); out += 12;
+ break;
+ case 7: ip+=8;
+ out[0]= (w >> 4) & MSK(6);
+ out[1]= (w >> 10) & MSK(6);
+ out[2]= (w >> 16) & MSK(6);
+ out[3]= (w >> 22) & MSK(6);
+ out[4]= (w >> 28) & MSK(6);
+ out[5]= (w >> 34) & MSK(6);
+ out[6]= (w >> 40) & MSK(6);
+ out[7]= (w >> 46) & MSK(6);
+ out[8]= (w >> 52) & MSK(6);
+ out[9]= (w >> 58) & MSK(6); out += 10;
+ break;
+ case 8: ip+=8;
+ out[0]= (w >> 4 ) & MSK(7);
+ out[1]= (w >> 11) & MSK(7);
+ out[2]= (w >> 18) & MSK(7);
+ out[3]= (w >> 25) & MSK(7);
+ out[4]= (w >> 32) & MSK(7);
+ out[5]= (w >> 39) & MSK(7);
+ out[6]= (w >> 46) & MSK(7);
+ out[7]= (w >> 53) & MSK(7); out += 8;
+ break;
+ case 9: ip+=8;
+ out[0]= (w >> 4 ) & MSK(8);
+ out[1]= (w >> 12) & MSK(8);
+ out[2]= (w >> 20) & MSK(8);
+ out[3]= (w >> 28) & MSK(8);
+ out[4]= (w >> 36) & MSK(8);
+ out[5]= (w >> 44) & MSK(8);
+ out[6]= (w >> 52) & MSK(8); out += 7;
+ break;
+ case 10: ip+=8;
+ out[0]= (w >> 4) & MSK(10);
+ out[1]= (w >> 14) & MSK(10);
+ out[2]= (w >> 24) & MSK(10);
+ out[3]= (w >> 34) & MSK(10);
+ out[4]= (w >> 44) & MSK(10);
+ out[5]= (w >> 54) & MSK(10); out += 6;
+ break;
+ case 11: ip+=8;
+ out[0]= (w >> 4) & MSK(12);
+ out[1]= (w >> 16) & MSK(12);
+ out[2]= (w >> 28) & MSK(12);
+ out[3]= (w >> 40) & MSK(12);
+ out[4]= (w >> 52) & MSK(12); out += 5;
+ break;
+ case 12: ip+=8;
+ out[0]= (w >> 4) & MSK(15);
+ out[1]= (w >> 19) & MSK(15);
+ out[2]= (w >> 34) & MSK(15);
+ out[3]= (w >> 49) & MSK(15); out += 4;
+ break;
+ case 13: ip+=8;
+ out[0]= (w >> 4) & MSK(20);
+ out[1]= (w >> 24) & MSK(20);
+ out[2]= (w >> 44) & MSK(20); out += 3;
+ break;
+ case 14: ip+=8;
+ out[0]= (w >> 4) & MSK(30);
+ out[1]= (w >> 34) & MSK(30); out += 2;
+ break;
+ case 15: ip+=8;
+ out[0]= (w >> 4) & ((1ull<<60)-1); out += 1;
+ break;
+ }
+ }
+ return ip;
+}
diff --git a/ext/simple8b.h b/ext/simple8b.h
new file mode 100644
index 0000000..1d387d5
--- /dev/null
+++ b/ext/simple8b.h
@@ -0,0 +1,2 @@
+unsigned char *vs8benc(unsigned *__restrict in, int n, unsigned char *__restrict out);
+unsigned char *vs8bdec(unsigned char *__restrict in, int n, unsigned *__restrict out);
diff --git a/ext/vabyte.h b/ext/vabyte.h
new file mode 100644
index 0000000..eb73810
--- /dev/null
+++ b/ext/vabyte.h
@@ -0,0 +1,99 @@
+// "variablebyte.h" C Version port by powturbo from https://github.com/lemire/FastPFor
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+
+#define extract7bits(i, val) (((val) >> (7 * (i))) & ((1U << 7) - 1))
+#define extract7bitsmaskless(i, val) ((val) >> (7 * (i)))
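+// This variant marks the *last* byte of each number by setting its high bit:
+// e.g. 300 = 0b100101100 is written as 0x2C (low 7 bits, high bit clear)
+// followed by 0x82 (remaining bits 0x02 with the terminator bit 0x80 set).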
+
+unsigned char *vbyteenc(unsigned *in, const size_t length, unsigned *out/*,
+ size_t &nvalue*/) {
+ unsigned char *bout = (unsigned char *)(out);
+ //const unsigned char *const initbout = (unsigned char *)(out);
+ //unsigned prev = 0;
+ size_t k;
+ for (k = 0; k < length; ++k) {
+ const unsigned val = /*delta ? in[k] - prev :*/ in[k];
+ //if (delta) prev = in[k];
+ /**
+ * Code below could be shorter. Whether it could be faster
+ * depends on your compiler and machine.
+ */
+ if (val < (1U << 7)) {
+ *bout = (unsigned char)(val | (1U << 7));
+ ++bout;
+ } else if (val < (1U << 14)) {
+ *bout = extract7bits(0,val);
+ ++bout;
+ *bout = extract7bitsmaskless(1,val) | (1U << 7);
+ ++bout;
+ } else if (val < (1U << 21)) {
+ *bout = extract7bits(0,val);
+ ++bout;
+ *bout = extract7bits(1,val);
+ ++bout;
+ *bout = extract7bitsmaskless(2,val) | (1U << 7);
+ ++bout;
+ } else if (val < (1U << 28)) {
+ *bout = extract7bits(0, val);
+ ++bout;
+ *bout = extract7bits(1, val);
+ ++bout;
+ *bout = extract7bits(2, val);
+ ++bout;
+ *bout = extract7bitsmaskless(3, val) | (1U << 7);
+ ++bout;
+ } else {
+ *bout = extract7bits(0,val);
+ ++bout;
+ *bout = extract7bits(1,val);
+ ++bout;
+ *bout = extract7bits(2,val);
+ ++bout;
+ *bout = extract7bits(3,val);
+ ++bout;
+ *bout = extract7bitsmaskless(4,val) | (1U << 7);
+ ++bout;
+ }
+ }
+ /*while (needPaddingTo32Bits(bout)) {
+ *bout++ = 0;
+ }
+ const size_t storageinbytes = bout - initbout;
+ assert((storageinbytes % 4) == 0);
+ nvalue = storageinbytes / 4;*/
+ return bout;
+}
+
+
+unsigned char *vbytedec(const unsigned char *in, const size_t length,
+ unsigned *out/*, size_t &nvalue*/) {
+ unsigned prev = 0;
+ if (length == 0) {
+ //nvalue = 0;
+ return (unsigned char *)in;//abort
+ }
+ const unsigned char *inbyte = (const unsigned char *)(in);
+ const unsigned char *const endbyte = (const unsigned char *)(out
+ + length);
+ //const unsigned *const initout(out);
+
+ while ((unsigned *)endbyte > out) {
+ unsigned int shift = 0; unsigned v;
+ for (v = 0; (unsigned *)endbyte > out; shift += 7) {
+ unsigned char c = *inbyte++;
+ v += ((c & 127) << shift);
+ if ((c & 128)) {
+ *out++ = /*delta ? (prev = v + prev) :*/ v;
+ break;
+ }
+ }
+ }
+ //nvalue = out - initout;
+ //inbyte = padTo32bits(inbyte);
+ return (unsigned char *)inbyte;
+ }
+
diff --git a/ext/varintg8iu.c b/ext/varintg8iu.c
new file mode 100644
index 0000000..dfca2cc
--- /dev/null
+++ b/ext/varintg8iu.c
@@ -0,0 +1,182 @@
+// C port Version of "VarIntG8IU.h" from https://github.com/lemire/FastPFor
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ */
+/**
+ *
+ * Implementation of varint-G8IU taken from
+ * Stepanov et al., SIMD-Based Decoding of Posting Lists, CIKM 2011
+ *
+ * Update: D. Lemire believes that this scheme was patented by Rose, Stepanov et al. (patent 20120221539).
+ * We wrote this code before the patent was published (August 2012).
+ *
+ * By Maxime Caron
+ * From
+ * https://github.com/maximecaron/SIMD-Based-Posting-lists
+ * with minor modifications by D. Lemire.
+ */
+#include <string.h>
+#ifndef __SSSE3__
+#pragma message "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3"
+#else
+#ifndef VARINTG8IU_H__
+#define VARINTG8IU_H__
+#include <stdint.h>
+//#include "codecs.h"
+#ifdef __GNUC__
+#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
+#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#else
+#define PREDICT_FALSE(x) x
+#define PREDICT_TRUE(x) x
+#endif
+#include "varintg8iu.h"
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+static int maskOutputSize[256];
+static char mask[256][32];
+
+ int getNumByteNeeded(const uint32_t value) {
+ if (value > 0x000000FF) {
+ if (value > 0x0000FFFF) {
+ if (value > 0x00FFFFFF) {
+ return 4;
+ } else {
+ return 3;
+ }
+ } else {
+ return 2;
+ }
+ } else {
+ return 1;
+ }
+ }
+
+
+ // For all possible values of the
+ // descriptor we build a table of any shuffle sequence
+ // that might be needed at decode time.
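+ // A block is 1 descriptor byte followed by 8 data bytes; each zero bit in
+ // the descriptor marks the last byte of one integer. For example, the
+ // values {1, 300} need 1 and 2 data bytes, clearing bits 0 and 2:
+ // desc = 0xFF ^ 0x01 ^ 0x04 = 0xFA.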
+void VarIntG8IU() {
+ for (int desc = 0; desc <= 255; desc++) {
+ int bitmask = 0x00000001;
+ int bitindex = 0;
+ // count the number of 0 bits in the descriptor
+ int complete = 0;
+ int ithSize[8];
+ int lastpos = -1;
+ while (bitindex < 8) {
+ if ((desc & bitmask) == 0) {
+ ithSize[complete] = bitindex - lastpos;
+ lastpos = bitindex;
+ complete++;
+ }
+ bitindex++;
+ bitmask = bitmask << 1;
+ }
+ maskOutputSize[desc] = complete;
+
+ int j = 0;
+ int k = 0;
+ for (int i = 0; i < complete; i++) {
+ for (int n = 0; n < 4; n++) {
+ if (n < ithSize[i]) {
+ mask[desc][k] = j;
+ j = j + 1;
+ } else {
+ mask[desc][k] = -1;
+ }
+ k = k + 1;
+ }
+ }
+
+ }
+
+ }
+
+unsigned char *vintg8enc(const uint32_t *__restrict in, const size_t length, unsigned char *__restrict out) {
+ const uint32_t *in_ = in + length; //size_t srclength = length * 4;unsigned char* dest = (unsigned char*)(out);size_t dstlength = length * 4;
+ //size_t compressed_size = 0;
+ while(in < in_ /*srclength > 0 && dstlength >= 9*/) { //compressed_size += encodeBlock(in, srclength, dst, nvalue);
+ unsigned char desc = 0xFF;
+ unsigned char bitmask = 0x01;
+ uint32_t buffer[8];
+ int ithSize[8];
+ int length = 0;
+ int numInt = 0;
+
+ while (in < in_ /*srclength > 0*/) {
+ const uint32_t* temp = in;
+ int byteNeeded = getNumByteNeeded(*temp);
+
+ if (PREDICT_FALSE(length + byteNeeded > 8)) {
+ break;
+ }
+
+ //flip the correct bit in desc
+ bitmask = bitmask << (byteNeeded - 1);
+ desc = desc ^ bitmask;
+ bitmask = bitmask << 1;
+
+ ithSize[numInt] = byteNeeded;
+ length += byteNeeded;
+ buffer[numInt] = *temp;
+ ++in;// = in + 1;
+ //srclength -= 4;
+ numInt++;
+ }
+ out[0] = desc;
+ int written = 1;
+ for(int i = 0; i < numInt; i++) {
+ int size = ithSize[i];
+ uint32_t value = buffer[i];
+ for (int j = 0; j < size; j++) {
+ out[written++] = value >> (j * 8);
+ }
+ }
+ out += 9; //dstlength -= 9; //compressed_size += 9;
+ }
+ // Output might not be a multiple of 4; the padding step is disabled in this port
+ return out; //out + ((compressed_size + 3)/ 4);
+ }
+
+unsigned char *vintg8dec(unsigned char *__restrict in, const size_t length, uint32_t *__restrict out) {
+ size_t srclength = length * 4;
+ const unsigned *out_ = out + length; //uint32_t * dest = out;size_t nvalue = length * 4; //uint32_t uncompressSize = 0;
+ while (out < out_ /*srclength >= 9*/) { //uncompressSize += decodeBlock(in, srclength, dst/*, nvalue*/);
+ const unsigned char* pdesc = in++;
+ unsigned char desc = *pdesc;
+ srclength -= 1;
+
+ const unsigned char* peek = in;
+ v16qi data;
+ if (PREDICT_TRUE(srclength >= 16)) {
+ // read 16 bytes of data directly only when enough input remains
+ data = __builtin_ia32_lddqu((const char*) (peek));
+ } else {
+ static char buff[16];
+ memcpy(buff, peek, 8);
+ data = __builtin_ia32_lddqu(buff);
+ }
+ // load the required shuffle mask
+ v16qi shf = __builtin_ia32_lddqu(mask[desc]);
+ v16qi result = __builtin_ia32_pshufb128(data, shf);
+ char* dst = (char*) (out);
+ __builtin_ia32_storedqu(dst, result);
+ int readSize = maskOutputSize[desc];
+
+ if (PREDICT_TRUE( readSize >= 4)) {
+ v16qi shf2 = __builtin_ia32_lddqu(mask[desc] + 16);
+ v16qi result2 = __builtin_ia32_pshufb128(data, shf2);
+ __builtin_ia32_storedqu(dst + (16), result2);
+ }
+ // pop 8 input char
+ in += 8; srclength -= 8; out += readSize; //dstlength -= readSize * 4;// uncompressSize += readSize;
+ }
+ return in; //(uint32_t *) (((uintptr_t) (src) + 3) & ~3);
+
+}
+
+#endif //__SSSE3__
+#endif
diff --git a/ext/varintg8iu.h b/ext/varintg8iu.h
new file mode 100644
index 0000000..48c8eac
--- /dev/null
+++ b/ext/varintg8iu.h
@@ -0,0 +1,5 @@
+#include <stdint.h>
+void VarIntG8IU();
+unsigned char *vintg8enc(const uint32_t *__restrict in, const size_t length, unsigned char *__restrict out);
+unsigned char *vintg8dec(unsigned char *__restrict in, const size_t length, uint32_t *__restrict out);
+
diff --git a/ext/vas16c.h b/ext/vas16c.h
new file mode 100644
index 0000000..84fffd4
--- /dev/null
+++ b/ext/vas16c.h
@@ -0,0 +1,36 @@
+// optimized version from: http://jinruhe.com/
+static int s16_cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1};
+static int s16_cbits[16][28] = {
+ {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+ {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
+ {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
+ {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0},
+ {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+ {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} };
+
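+// Simple16: each 32-bit word is a 4-bit selector plus 28 data bits. The
+// selector picks a row of s16_cbits: selector 0 packs twenty-eight 1-bit
+// values, selector 5 packs one 4-bit and eight 3-bit values (4 + 8*3 = 28),
+// selector 15 packs a single 28-bit value.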
+#define S16ENC(__w, __p, m) { unsigned *_p = __p, *_w = __w; \
+ unsigned int _k, _j, _m, _o; \
+ for (_k = 0; _k < 16; _k++) { \
+ *_w = _k<<28; \
+ _m = (s16_cnum[_k] < m)? s16_cnum[_k]:m; \
+ for (_j = 0, _o = 0; (_j < _m) && (*(_p+_j) < (1u << s16_cbits[_k][_j])); _j++) { \
+ *_w |= *(_p+_j) << _o; \
+ _o += s16_cbits[_k][_j]; \
+ } \
+ if (_j == _m) { __p = _p + _m; __w = _w + 1; break; } \
+ } \
+}
+
+#define S16DEC(_w, _p, _x) { register unsigned _rw = *(_w)++; \
+ switch((_rw)>>28) {\
+ case 0:\
+ _p[ 0] = (_rw ) & 1;\
+ _p[ 1] = (_rw>> 1) & 1;\
+ _p[ 2] = (_rw>> 2) & 1;\
+ _p[ 3] = (_rw>> 3) & 1;\
+ _p[ 4] = (_rw>> 4) & 1;\
+ _p[ 5] = (_rw>> 5) & 1;\
+ _p[ 6] = (_rw>> 6) & 1;\
+ _p[ 7] = (_rw>> 7) & 1;\
+ _p[ 8] = (_rw>> 8) & 1;\
+ _p[ 9] = (_rw>> 9) & 1;\
+ _p[10] = (_rw>>10) & 1;\
+ _p[11] = (_rw>>11) & 1;\
+ _p[12] = (_rw>>12) & 1;\
+ _p[13] = (_rw>>13) & 1;\
+ _p[14] = (_rw>>14) & 1;\
+ _p[15] = (_rw>>15) & 1;\
+ _p[16] = (_rw>>16) & 1;\
+ _p[17] = (_rw>>17) & 1;\
+ _p[18] = (_rw>>18) & 1;\
+ _p[19] = (_rw>>19) & 1;\
+ _p[20] = (_rw>>20) & 1;\
+ _p[21] = (_rw>>21) & 1;\
+ _p[22] = (_rw>>22) & 1;\
+ _p[23] = (_rw>>23) & 1;\
+ _p[24] = (_rw>>24) & 1;\
+ _p[25] = (_rw>>25) & 1;\
+ _p[26] = (_rw>>26) & 1;\
+ _p[27] = (_rw>>27) & 1; _p += 28;\
+ break;\
+ case 1: \
+ _p[ 0] = (_rw ) & 3;\
+ _p[ 1] = (_rw>> 2) & 3;\
+ _p[ 2] = (_rw>> 4) & 3;\
+ _p[ 3] = (_rw>> 6) & 3;\
+ _p[ 4] = (_rw>> 8) & 3;\
+ _p[ 5] = (_rw>>10) & 3;\
+ _p[ 6] = (_rw>>12) & 3;\
+ _p[ 7] = (_rw>>14) & 1;\
+ _p[ 8] = (_rw>>15) & 1;\
+ _p[ 9] = (_rw>>16) & 1;\
+ _p[10] = (_rw>>17) & 1;\
+ _p[11] = (_rw>>18) & 1;\
+ _p[12] = (_rw>>19) & 1;\
+ _p[13] = (_rw>>20) & 1;\
+ _p[14] = (_rw>>21) & 1;\
+ _p[15] = (_rw>>22) & 1;\
+ _p[16] = (_rw>>23) & 1;\
+ _p[17] = (_rw>>24) & 1;\
+ _p[18] = (_rw>>25) & 1;\
+ _p[19] = (_rw>>26) & 1;\
+ _p[20] = (_rw>>27) & 1; _p += 21; \
+ break; \
+ case 2: \
+ _p[0] = (_rw) & 1; \
+ _p[1] = (_rw>>1) & 1;\
+ _p[2] = (_rw>>2) & 1;\
+ _p[3] = (_rw>>3) & 1;\
+ _p[4] = (_rw>>4) & 1;\
+ _p[5] = (_rw>>5) & 1;\
+ _p[6] = (_rw>>6) & 1;\
+ _p[7] = (_rw>>7) & 3;\
+ _p[8] = (_rw>>9) & 3;\
+ _p[9] = (_rw>>11) & 3;\
+ _p[10] = (_rw>>13) & 3;\
+ _p[11] = (_rw>>15) & 3;\
+ _p[12] = (_rw>>17) & 3;\
+ _p[13] = (_rw>>19) & 3;\
+ _p[14] = (_rw>>21) & 1;\
+ _p[15] = (_rw>>22) & 1;\
+ _p[16] = (_rw>>23) & 1;\
+ _p[17] = (_rw>>24) & 1;\
+ _p[18] = (_rw>>25) & 1;\
+ _p[19] = (_rw>>26) & 1;\
+ _p[20] = (_rw>>27) & 1; _p += 21;\
+ break; \
+ case 3: \
+ _p[0] = (_rw) & 1; \
+ _p[1] = (_rw>>1) & 1;\
+ _p[2] = (_rw>>2) & 1;\
+ _p[3] = (_rw>>3) & 1;\
+ _p[4] = (_rw>>4) & 1;\
+ _p[5] = (_rw>>5) & 1;\
+ _p[6] = (_rw>>6) & 1;\
+ _p[7] = (_rw>>7) & 1;\
+ _p[8] = (_rw>>8) & 1;\
+ _p[9] = (_rw>>9) & 1;\
+ _p[10] = (_rw>>10) & 1;\
+ _p[11] = (_rw>>11) & 1;\
+ _p[12] = (_rw>>12) & 1;\
+ _p[13] = (_rw>>13) & 1;\
+ _p[14] = (_rw>>14) & 3;\
+ _p[15] = (_rw>>16) & 3;\
+ _p[16] = (_rw>>18) & 3;\
+ _p[17] = (_rw>>20) & 3;\
+ _p[18] = (_rw>>22) & 3;\
+ _p[19] = (_rw>>24) & 3;\
+ _p[20] = (_rw>>26) & 3; _p += 21;\
+ break; \
+ case 4: \
+ _p[ 0] = (_rw ) & 3;\
+ _p[ 1] = (_rw>> 2) & 3;\
+ _p[ 2] = (_rw>> 4) & 3;\
+ _p[ 3] = (_rw>> 6) & 3;\
+ _p[ 4] = (_rw>> 8) & 3;\
+ _p[ 5] = (_rw>>10) & 3;\
+ _p[ 6] = (_rw>>12) & 3;\
+ _p[ 7] = (_rw>>14) & 3;\
+ _p[ 8] = (_rw>>16) & 3;\
+ _p[ 9] = (_rw>>18) & 3;\
+ _p[10] = (_rw>>20) & 3;\
+ _p[11] = (_rw>>22) & 3;\
+ _p[12] = (_rw>>24) & 3;\
+ _p[13] = (_rw>>26) & 3; _p += 14;\
+ break; \
+ case 5: \
+ _p[0] = (_rw) & 15; \
+ _p[1] = (_rw>>4) & 7;\
+ _p[2] = (_rw>>7) & 7;\
+ _p[3] = (_rw>>10) & 7;\
+ _p[4] = (_rw>>13) & 7;\
+ _p[5] = (_rw>>16) & 7;\
+ _p[6] = (_rw>>19) & 7;\
+ _p[7] = (_rw>>22) & 7;\
+ _p[8] = (_rw>>25) & 7; _p += 9;\
+ break; \
+ case 6: \
+ _p[0] = (_rw) & 7; \
+ _p[1] = (_rw>>3) & 15;\
+ _p[2] = (_rw>>7) & 15;\
+ _p[3] = (_rw>>11) & 15;\
+ _p[4] = (_rw>>15) & 15;\
+ _p[5] = (_rw>>19) & 7;\
+ _p[6] = (_rw>>22) & 7;\
+ _p[7] = (_rw>>25) & 7; _p += 8;\
+ break; \
+ case 7: \
+ _p[0] = (_rw) & 15; \
+ _p[1] = (_rw>>4) & 15;\
+ _p[2] = (_rw>>8) & 15;\
+ _p[3] = (_rw>>12) & 15;\
+ _p[4] = (_rw>>16) & 15;\
+ _p[5] = (_rw>>20) & 15;\
+ _p[6] = (_rw>>24) & 15; _p += 7;\
+ break; \
+ case 8: \
+ _p[0] = (_rw ) & 31;\
+ _p[1] = (_rw>> 5) & 31;\
+ _p[2] = (_rw>>10) & 31;\
+ _p[3] = (_rw>>15) & 31;\
+ _p[4] = (_rw>>20) & 15;\
+ _p[5] = (_rw>>24) & 15; _p += 6;\
+ break; \
+ case 9: \
+ _p[0] = (_rw) & 15; \
+ _p[1] = (_rw>>4) & 15;\
+ _p[2] = (_rw>>8) & 31;\
+ _p[3] = (_rw>>13) & 31;\
+ _p[4] = (_rw>>18) & 31;\
+ _p[5] = (_rw>>23) & 31; _p += 6;\
+ break; \
+ case 10: \
+ _p[0] = (_rw) & 63; \
+ _p[1] = (_rw>>6) & 63;\
+ _p[2] = (_rw>>12) & 63;\
+ _p[3] = (_rw>>18) & 31;\
+ _p[4] = (_rw>>23) & 31; _p += 5;\
+ break; \
+ case 11: \
+ _p[0] = (_rw) & 31; \
+ _p[1] = (_rw>>5) & 31;\
+ _p[2] = (_rw>>10) & 63;\
+ _p[3] = (_rw>>16) & 63;\
+ _p[4] = (_rw>>22) & 63; _p += 5;\
+ break; \
+ case 12: \
+ _p[0] = (_rw) & 127; \
+ _p[1] = (_rw>>7) & 127;\
+ _p[2] = (_rw>>14) & 127;\
+ _p[3] = (_rw>>21) & 127; _p += 4;\
+ break; \
+ case 13: \
+ _p[0] = (_rw) & 1023; \
+ _p[1] = (_rw>>10) & 511;\
+ _p[2] = (_rw>>19) & 511; _p += 3;\
+ break; \
+ case 14: \
+ _p[0] = (_rw) & 16383; \
+ _p[1] = (_rw>>14) & 16383; _p += 2;\
+ break; \
+ case 15: \
+ _p[0] = (_rw) & ((1<<28)-1); _p++; \
+ break; \
+ } \
+}
+static inline unsigned char *vs16dec(unsigned *__restrict in, int n, unsigned *__restrict out) {
+  unsigned *out_ = out + n;
+  while (out < out_) S16DEC(in, out, ;);
+  return (unsigned char *)in;
+}
+
+#if 0
+#define BREAK _rw = *_in++; goto *_lab[__out<_oute?((_rw)>>28):16]
+
+#define s16dec(__in, __n, __pout) ({\
+ __label__ _lab0,_lab1,_lab2,_lab3,_lab4,_lab5,_lab6,_lab7,_lab8,_lab9,_lab10,_lab11,_lab12,_lab13,_lab14,_lab15,_labend;\
+ static void *_lab[] = { &&_lab0, &&_lab1, &&_lab2, &&_lab3, &&_lab4, &&_lab5, &&_lab6, &&_lab7, &&_lab8, &&_lab9, &&_lab10, &&_lab11, &&_lab12, &&_lab13, &&_lab14, &&_lab15, &&_labend };\
+ unsigned *_in = __in; typeof(__pout[0]) *__out = __pout, *_oute = __out+(__n); register unsigned _rw = *_in++; goto *_lab[(_rw)>>28];\
+ _lab0:\
+ __out[0] = (_rw) & 1; \
+ __out[1] = (_rw>>1) & 1; \
+ __out[2] = (_rw>>2) & 1; \
+ __out[3] = (_rw>>3) & 1; \
+ __out[4] = (_rw>>4) & 1; \
+ __out[5] = (_rw>>5) & 1; \
+ __out[6] = (_rw>>6) & 1; \
+ __out[7] = (_rw>>7) & 1; \
+ __out[8] = (_rw>>8) & 1; \
+ __out[9] = (_rw>>9) & 1; \
+ __out[10] = (_rw>>10) & 1; \
+ __out[11] = (_rw>>11) & 1; \
+ __out[12] = (_rw>>12) & 1; \
+ __out[13] = (_rw>>13) & 1; \
+ __out[14] = (_rw>>14) & 1; \
+ __out[15] = (_rw>>15) & 1; \
+ __out[16] = (_rw>>16) & 1; \
+ __out[17] = (_rw>>17) & 1; \
+ __out[18] = (_rw>>18) & 1; \
+ __out[19] = (_rw>>19) & 1; \
+ __out[20] = (_rw>>20) & 1; \
+ __out[21] = (_rw>>21) & 1; \
+ __out[22] = (_rw>>22) & 1; \
+ __out[23] = (_rw>>23) & 1; \
+ __out[24] = (_rw>>24) & 1; \
+ __out[25] = (_rw>>25) & 1; \
+ __out[26] = (_rw>>26) & 1; \
+ __out[27] = (_rw>>27) & 1; __out += 28;\
+ BREAK; \
+ _lab1: \
+ __out[0] = (_rw) & 3; \
+ __out[1] = (_rw>>2) & 3; \
+ __out[2] = (_rw>>4) & 3; \
+ __out[3] = (_rw>>6) & 3; \
+ __out[4] = (_rw>>8) & 3; \
+ __out[5] = (_rw>>10) & 3; \
+ __out[6] = (_rw>>12) & 3; \
+ __out[7] = (_rw>>14) & 1; \
+ __out[8] = (_rw>>15) & 1; \
+ __out[9] = (_rw>>16) & 1; \
+ __out[10] = (_rw>>17) & 1; \
+ __out[11] = (_rw>>18) & 1; \
+ __out[12] = (_rw>>19) & 1; \
+ __out[13] = (_rw>>20) & 1; \
+ __out[14] = (_rw>>21) & 1; \
+ __out[15] = (_rw>>22) & 1; \
+ __out[16] = (_rw>>23) & 1; \
+ __out[17] = (_rw>>24) & 1; \
+ __out[18] = (_rw>>25) & 1; \
+ __out[19] = (_rw>>26) & 1; \
+ __out[20] = (_rw>>27) & 1; __out += 21; \
+ BREAK; \
+ _lab2: \
+ __out[0] = (_rw) & 1; \
+ __out[1] = (_rw>>1) & 1; \
+ __out[2] = (_rw>>2) & 1; \
+ __out[3] = (_rw>>3) & 1; \
+ __out[4] = (_rw>>4) & 1; \
+ __out[5] = (_rw>>5) & 1; \
+ __out[6] = (_rw>>6) & 1; \
+ __out[7] = (_rw>>7) & 3; \
+ __out[8] = (_rw>>9) & 3; \
+ __out[9] = (_rw>>11) & 3; \
+ __out[10] = (_rw>>13) & 3; \
+ __out[11] = (_rw>>15) & 3; \
+ __out[12] = (_rw>>17) & 3; \
+ __out[13] = (_rw>>19) & 3; \
+ __out[14] = (_rw>>21) & 1; \
+ __out[15] = (_rw>>22) & 1; \
+ __out[16] = (_rw>>23) & 1; \
+ __out[17] = (_rw>>24) & 1; \
+ __out[18] = (_rw>>25) & 1; \
+ __out[19] = (_rw>>26) & 1; \
+ __out[20] = (_rw>>27) & 1; __out += 21;\
+ BREAK; \
+ _lab3: \
+ __out[0] = (_rw) & 1; \
+ __out[1] = (_rw>>1) & 1; \
+ __out[2] = (_rw>>2) & 1; \
+ __out[3] = (_rw>>3) & 1; \
+ __out[4] = (_rw>>4) & 1; \
+ __out[5] = (_rw>>5) & 1; \
+ __out[6] = (_rw>>6) & 1; \
+ __out[7] = (_rw>>7) & 1; \
+ __out[8] = (_rw>>8) & 1; \
+ __out[9] = (_rw>>9) & 1; \
+ __out[10] = (_rw>>10) & 1; \
+ __out[11] = (_rw>>11) & 1; \
+ __out[12] = (_rw>>12) & 1; \
+ __out[13] = (_rw>>13) & 1; \
+ __out[14] = (_rw>>14) & 3; \
+ __out[15] = (_rw>>16) & 3; \
+ __out[16] = (_rw>>18) & 3; \
+ __out[17] = (_rw>>20) & 3; \
+ __out[18] = (_rw>>22) & 3; \
+ __out[19] = (_rw>>24) & 3; \
+ __out[20] = (_rw>>26) & 3; __out += 21;\
+ BREAK; \
+ _lab4: \
+ __out[0] = (_rw) & 3; \
+ __out[1] = (_rw>>2) & 3; \
+ __out[2] = (_rw>>4) & 3; \
+ __out[3] = (_rw>>6) & 3; \
+ __out[4] = (_rw>>8) & 3; \
+ __out[5] = (_rw>>10) & 3; \
+ __out[6] = (_rw>>12) & 3; \
+ __out[7] = (_rw>>14) & 3; \
+ __out[8] = (_rw>>16) & 3; \
+ __out[9] = (_rw>>18) & 3; \
+ __out[10] = (_rw>>20) & 3; \
+ __out[11] = (_rw>>22) & 3; \
+ __out[12] = (_rw>>24) & 3; \
+ __out[13] = (_rw>>26) & 3; __out += 14;\
+ BREAK; \
+ _lab5: \
+ __out[0] = (_rw) & 15; \
+ __out[1] = (_rw>>4) & 7; \
+ __out[2] = (_rw>>7) & 7; \
+ __out[3] = (_rw>>10) & 7; \
+ __out[4] = (_rw>>13) & 7; \
+ __out[5] = (_rw>>16) & 7; \
+ __out[6] = (_rw>>19) & 7; \
+ __out[7] = (_rw>>22) & 7; \
+ __out[8] = (_rw>>25) & 7; __out += 9;\
+ BREAK; \
+ _lab6: \
+ __out[0] = (_rw) & 7; \
+ __out[1] = (_rw>>3) & 15; \
+ __out[2] = (_rw>>7) & 15; \
+ __out[3] = (_rw>>11) & 15; \
+ __out[4] = (_rw>>15) & 15; \
+ __out[5] = (_rw>>19) & 7; \
+ __out[6] = (_rw>>22) & 7; \
+ __out[7] = (_rw>>25) & 7; __out += 8;\
+ BREAK; \
+ _lab7: \
+ __out[0] = (_rw) & 15; \
+ __out[1] = (_rw>>4) & 15; \
+ __out[2] = (_rw>>8) & 15; \
+ __out[3] = (_rw>>12) & 15; \
+ __out[4] = (_rw>>16) & 15; \
+ __out[5] = (_rw>>20) & 15; \
+ __out[6] = (_rw>>24) & 15; __out += 7;\
+ BREAK; \
+ _lab8: \
+ __out[0] = (_rw) & 31; \
+ __out[1] = (_rw>>5) & 31; \
+ __out[2] = (_rw>>10) & 31; \
+ __out[3] = (_rw>>15) & 31; \
+ __out[4] = (_rw>>20) & 15; \
+ __out[5] = (_rw>>24) & 15; __out += 6;\
+ BREAK; \
+ _lab9: \
+ __out[0] = (_rw) & 15; \
+ __out[1] = (_rw>>4) & 15; \
+ __out[2] = (_rw>>8) & 31; \
+ __out[3] = (_rw>>13) & 31; \
+ __out[4] = (_rw>>18) & 31; \
+ __out[5] = (_rw>>23) & 31; __out += 6;\
+ BREAK; \
+ _lab10: \
+ __out[0] = (_rw) & 63; \
+ __out[1] = (_rw>>6) & 63; \
+ __out[2] = (_rw>>12) & 63; \
+ __out[3] = (_rw>>18) & 31; \
+ __out[4] = (_rw>>23) & 31; __out += 5;\
+ BREAK; \
+ _lab11: \
+ __out[0] = (_rw) & 31; \
+ __out[1] = (_rw>>5) & 31; \
+ __out[2] = (_rw>>10) & 63; \
+ __out[3] = (_rw>>16) & 63; \
+ __out[4] = (_rw>>22) & 63; __out += 5;\
+ BREAK; \
+ _lab12: \
+ __out[0] = (_rw) & 127; \
+ __out[1] = (_rw>>7) & 127; \
+ __out[2] = (_rw>>14) & 127; \
+ __out[3] = (_rw>>21) & 127; __out += 4;\
+ BREAK; \
+ _lab13: \
+ __out[0] = (_rw) & 1023; \
+ __out[1] = (_rw>>10) & 511; \
+ __out[2] = (_rw>>19) & 511; __out += 3;\
+ BREAK; \
+ _lab14:\
+ __out[0] = (_rw) & 16383; \
+ __out[1] = (_rw>>14) & 16383; __out += 2;\
+ BREAK; \
+ _lab15:\
+ __out[0] = (_rw) & ((1<<28)-1); __out++; \
+ BREAK;\
+ _labend:;(_in-1);\
+})
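+// Each _labN case above decodes one 28-bit word into a fixed pattern of field widths
+// selected by the tag (e.g. _lab4: 14 x 2-bit values, _lab14: 2 x 14-bit, _lab15: one
+// 28-bit value) -- the Simple-16 style layouts; the final (_in-1) is the value the
+// GCC statement expression returns.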
+#endif
diff --git a/ext/vbyte_poly.h b/ext/vbyte_poly.h
new file mode 100644
index 0000000..3c2668d
--- /dev/null
+++ b/ext/vbyte_poly.h
@@ -0,0 +1,46 @@
+//
+#define VBYTE_ENC(_v, _n) \
+{\
+ unsigned _num; \
+ unsigned char _barray[5]; \
+ unsigned _i, _started = 0; \
+ _num = _n; \
+ for (_i = 0; _i < 5; _i++) \
+ { \
+ _barray[_i] = ((_num%128)<<1); \
+ _num = _num/128; \
+ } \
+ for (_i = 4; _i > 0; _i--) \
+ { \
+ if ((_barray[_i] != 0) || (_started == 1)) \
+ { \
+ _started = 1; \
+ *_v = _barray[_i]|0x1; \
+ _v++; \
+ } \
+ } \
+ *_v = _barray[0]|0x0; \
+ _v++; \
+}
+
+#define VBYTE_DEC(_v, _n) \
+{\
+ _n = ((*_v>>1)); \
+ if ((*_v&0x1) != 0) \
+ { \
+ _v++; \
+ _n = (_n<<7) + ((*_v>>1)); \
+ if ((*_v&0x1)!= 0) \
+ { \
+ _v++; \
+ _n = (_n<<7) + ((*_v>>1)); \
+ if ((*_v&0x1) != 0) \
+ { \
+ _v++; \
+ _n = (_n<<7) + ((*_v>>1)); \
+ }\
+ }\
+ }\
+ _v++; \
+}
+
diff --git a/icbench.c b/icbench.c
index d417e9f..d073c76 100644
--- a/icbench.c
+++ b/icbench.c
@@ -1,7 +1,7 @@
/**
Copyright (C) powturbo 2013-2014
GPL v2 License
-
+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
@@ -16,321 +16,351 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
icbench.c - "Integer Compression" benchmark program
**/
-
+
+#define _LARGEFILE64_SOURCE 1
+#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
-
+#include <math.h>
+
#include <sys/stat.h>
#include <time.h>
-#define PGM_FD(__f) struct stat sbuf; fstat(__f, &sbuf); __off64_t vlen = sbuf.st_size, vtel = 0; int pgm = 0; time_t t0 = time(NULL);
-#define PGM_FDPUT(__f) vtel = lseek(__f, 0, SEEK_CUR);if(vtel*10/vlen != pgm) { double secs = time(NULL) - t0; pgm = vtel*10/vlen; printf("%d%%%.1f ", pgm, ((secs/60.0) * (vlen - vtel))/vtel); fflush(stdout); }
-//-------------------------------------------------------------------------------------------------------------
+#include
+
+// simple-8b, simple16 and optpfd don't work with all integer lists.
+// Enable them only if you want to test:
+//#define USE_SIMPLE_8B // crashes on some lists
+//#define USE_SIMPLE16 // limited to 28 bits
+//#define USE_OPTPFD // compression too slow and limited to 28 bits. crashes on some lists
+#define STATS
+//---------------------------------------- Platform ------------------------
+ #ifdef _WIN32
+#define srand48(x) srand(x)
+#define drand48() ((double)(rand()) / RAND_MAX)
+#define __off64_t _off64_t
+ #endif
+//---------------------------------------- Time ---------------------------------------------------------------------
typedef unsigned long long tm_t;
#define TM_TMAX (1ull<<63)
- #ifdef _MSC_VER // __rdtsc
-#include <intrin.h>
- #else
-#include <x86intrin.h>
- #endif
-
- #ifdef _WIN32
-#include <windows.h>
-#define TM_T 1
-
-static tm_t tmtime(void) {
- LARGE_INTEGER tm;
- QueryPerformanceCounter(&tm);
- return (tm_t)(tm.QuadPart/tps.QuadPart);
-}
-
-LARGE_INTEGER tps;
-static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
- #else
#include <sys/time.h>
#define TM_T 1000000.0
-static tm_t tmtime(void) {
- struct timeval tm;
- gettimeofday(&tm, NULL);
- return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec;
-}
+static tm_t tmtime(void) { struct timeval tm; gettimeofday(&tm, NULL); return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; }
+static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+static double tmsec( tm_t tm) { return (double)tm/1000000.0; }
+static double tmmsec(tm_t tm) { return (double)tm/1000.0; }
-static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
- #endif
-//--------------------------------------------------------------------------------------------------------
-#include "vint.h"
+//-------------------------------------- TurboPFor ------------------------------------------------------------------
+#include "vint.h"
#include "vsimple.h"
#include "bitpack.h"
#include "bitunpack.h"
+
#include "vp4dc.h"
#include "vp4dd.h"
-#include "aux/vas16c.h"
-#include "aux/vas16d.h"
-#include "aux/OPT_PFD/opt_p4.h"
-#include "aux/vabyte.h"
-#include "aux/simple8b.h"
-#include "aux/varintg8iu.h"
+unsigned char *u32enc(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return (unsigned char *)out;}
+unsigned char *u32dec(unsigned *__restrict in, int n, unsigned *__restrict out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return (unsigned char *)in; }
+
+#define PAD8(__x) (((__x)+7)/8)
+unsigned char *_bitunpackx32(unsigned char *__restrict in, unsigned n, unsigned b, unsigned *__restrict out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
+
+unsigned char *bitdunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitd0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitfunpackx32( unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+i+1; return in + PAD8(n*b); }
+unsigned char *bitf0unpackx32(unsigned char *__restrict in, unsigned n, unsigned b, int start, unsigned *__restrict out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start; return in + PAD8(n*b); }
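+// The *x32 helpers above decode one value per bitgetx32 call (direct access) instead
+// of bulk-unpacking a whole block: slower for full scans, but they back the "DA"
+// (direct access) schemes where single entries are wanted without block decompression.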
+//-------------------------------------- External functions for comparison ------------------------------------------------------------------------
+#include "ext/vas16c.h"
+#include "ext/vas16d.h"
+#include "ext/OPT_PFD/opt_p4.h"
+#include "ext/vabyte.h"
+#include "ext/simple8b.h"
+#include "ext/varintg8iu.h"
+#include "ext/varintg8iu.h"
+#include "ext/simdcomp/include/simdbitpacking.h"
unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
uint32_t *in_;
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
- return out;
+ return (unsigned char *)out;
}
unsigned char *simdpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
uint32_t *in_;
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpack(in, (__m128i *)out, b);
- return out;
+ return (unsigned char *)out;
}
unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
uint32_t k, *out_;
- for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack(in, out, b);
- return in;
+ for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack((const __m128i *)in, out, b);
+ return (unsigned char *)in;
}
unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
uint32_t *in_;
for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); //simdpackwithoutmaskd1(x, ip+1, (__m128i *)out, b);
- return out;
+ return (unsigned char *)out;
}
unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
uint32_t k, *out_;
for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b);
- return in;
+ return (unsigned char *)in;
}
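+// Note: these simdpack*/simdunpack* wrappers process only whole 128-integer SIMD
+// blocks (the loops run while in + 128 <= in_); callers must handle any tail of
+// fewer than 128 integers separately (see the n < 129 fallbacks in the P_SIMDH cases below).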
-unsigned char *u32enc(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return out;}
-unsigned char *u32dec(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return in;}
-
-#include "aux/vbyte_poly.h"
-unsigned char *vavbyte1enc(int *in, int n, unsigned char *out) {
+#include "ext/vbyte_poly.h"
+unsigned char *vbpolyenc(int *in, int n, unsigned char *out) {
int i; for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } return out;
}
-void vavbyte1dec(unsigned char *in, int n, int *out) {
- int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } return out;
+unsigned char *vbpolydec(unsigned char *in, int n, int *out) {
+ int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; } return in;
}
-
-//-------------------------------------------------------------------------------------------------
-#define VBLIM 64
+//-------------------------------------------------------------------------------------------------------------------
+#define BLK_SIZE (64*1024)
+#define PACK_SIZE 128
enum {
P_CPY,
- P_VB, P_VBL, P_VG8,
+ P_VB, P_VBL, P_VG8, P_VBP, P_VBI,
P_PCK, P_PCKR, P_SIMDH,
- P_SV, P_S16, P_S8BO,
- P_P4D, P_P4DR, P_OPTP4
-};
-
-unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int bb) {
- unsigned *ip=in;
- int i,b;
+ P_SV, P_S16, P_S64,
+ P_P4D, P_P4DR, P_OPTP4,
+};
+//------------------------------------------------ random integer array (not sorted) ---------------------------------------------------------------------------
+unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int b) { int i;
switch(id) {
- case P_CPY:
- out = u32enc( ip, n, out); break;
- case P_VB:
- out = vbenc( ip, n, out); break;
- case P_VBL:
- out = vbyteenc( ip, n, out); break;
- case P_VG8:
- out = vintg8enc(ip, n, out); break;
+ case P_CPY: return u32enc( in, n, (unsigned *)out);
- //----------- simple -------------------
- case P_SV:
- out = vsenc32( ip, n, out); break;
- case P_S16:
- { unsigned *c=ip,*ce=c+n;
- while(c < ce) S16ENC(out, c, ce - c);
- }
- break;
- case P_S8BO:
- out = s8benco( ip, n, out);
- break;
+ // --------- variable byte -------------------------------------------
+ case P_VB: return vbenc( in, n, out);
- //----------- PFOR -------------------
+ case P_VBL: return vbyteenc( in, n, (unsigned *)out);
+ case P_VG8: return vintg8enc(in, n, out);
+ case P_VBP: return vbpolyenc(in, n, out);
+ // --------- simple family: simple16, simpleV, simple64 ---------------
+ case P_SV: return vsenc32( in, n, out);
+
+ case P_S16: return vs16enc( in, n, (unsigned *)out);
+ case P_S64: return vs8benc( in, n, out);
+ // --------- PFor -----------------------------------------------------
case P_P4DR:
- case P_P4D:
- if(n>= 5;
- }
- *op = x;
- in = bitunpack32( in, n-1, b, op+1);
- }
- break;
- case P_PCKR:
- {
- unsigned x;
- vbgeta(in, x, ;);
- if(bb < 0) {
- b = x & 0x1f; x >>= 5;
- }
- *op = x;
- in = _bitunpackx32(in, n-1, b, op+1);
- }
- break;
- case P_SIMDH:
- if(n <129) in = vbytedec(in, n, op);
- else {
- unsigned x;
- vbgeta(in, x, ;);
- if(bb < 0) {
- b = x & 0x1f; x >>= 5;
- }
- *op = x;
- in = simdunpackn( in, n-1, b, op+1);
- }
- break;
- default: printf("Fatal- Not entry %d", id); exit(0);
+unsigned char *bedec(unsigned char *in, size_t n, unsigned *out, int id, int b) {
+ switch(id) {
+ case P_CPY: return u32dec( (unsigned *)in, n, out);
+ // --------- variable byte -------------------------------------------
+ case P_VB: return vbdec( in, n, out);
+
+ case P_VBL: return vbytedec( in, n, out);
+ case P_VG8: return vintg8dec(in, n, out);
+ case P_VBP: return vbpolydec(in, n, out);
+
+ // --------- simple family: simple16, simpleV, simple64 ---------------
+ case P_SV: return vsdec32( in, n, out);
+
+ case P_S16: return vs16dec( (unsigned *)in, n, out);
+ case P_S64: return vs8bdec( in, n, out);
+
+ // --------- PFor -----------------------------------------------------
+ case P_OPTP4 : if(n < 128) return vbytedec(in, n, out); else { unsigned all_array[2048]; return (unsigned char *)detailed_p4_decode(out, (unsigned *)in, all_array); }
+ case P_P4D : return p4ddec32( in, n, out);
+ case P_P4DR : return p4ddecx32( in, n, out);
+
+ // --------- bit packing -------------------------------------------
+ case P_PCK: if(b < 0) b = *in++; return bitunpack32( in, n, b, out);
+ case P_PCKR: if(b < 0) b = *in++; return _bitunpackx32( in, n, b, out);
+
+ case P_SIMDH:
+ if(n < 128) return vbytedec(in, n, out);
+ else { if(b < 0) b = *in++; return simdunpackn( (unsigned *)in, n, b, out); }
+ default: die("Fatal- Not entry %d", id);
}
return in;
}
-struct libss { int id; char *s,*v; };
+//------------------------------------------------- Sorted integer array : Delta/Differential compression ------------------------------------------------
+//#define DELTA(in, n, mode, pa) for(pa[0]=in[0],v = 1; v < n; v++) pa[v] = in[v] - in[v-1] - mode
+#define DELTA( __in, __n, __mode, __pa) { unsigned _v; for( __pa[0]=__in[0],_v = __n-1; _v > 0; --_v) __pa[_v] = (__in[_v] - __in[_v-1]) - __mode; }
+#define DELTAB(__in, __n, __mode, __b, __pa) { unsigned _v; for(__b=0,__pa[0]=__in[0],_v = __n-1; _v > 0; --_v) __pa[_v] = (__in[_v] - __in[_v-1]) - __mode, __b |= __pa[_v]; __b = bsr32(__b); }
+
+#define DELTR( __in, __n, __mode, __pa) { unsigned _v; for( __pa[0]=__in[0],_v = 1; _v < __n; _v++) __pa[_v] = (__in[_v] - __pa[0]) - _v*__mode; }
+#define DELTRB(__in, __n, __mode, __b, __pa) { unsigned _v; for(__b=0,__pa[0]=__in[0],_v = 1; _v < __n; _v++) __pa[_v] = (__in[_v] - __pa[0]) - _v*__mode, __b |= __pa[_v]; __b = bsr32(__b); }
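+// Worked example (illustrative): sorted input {3,5,9} with mode=1:
+// DELTA stores the gaps minus mode: pa = {3, 5-3-1, 9-5-1} = {3,1,3}.
+// DELTR stores offsets from the first value instead: pa = {3, (5-3)-1, (9-3)-2} = {3,1,4},
+// so any element can be recovered from pa[0] alone, without a running sum (direct access).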
+
+unsigned char *besenc(unsigned *in, size_t n, unsigned char *out, int id, int mode) {
+ unsigned pa[BLK_SIZE+2048],x; unsigned b;
+
+ switch(id) {
+ case P_CPY: return u32enc( in, n, (unsigned *)out);
+ //----------- Variable byte ----------------------------------------------------------------------------------------
+ case P_VB: DELTA( in, n, mode, pa); return vbenc( pa, n, out);
+
+ case P_VBL: DELTA( in, n, mode, pa); return vbyteenc( pa, n, (unsigned *)out);
+ case P_VBP: DELTA( in, n, mode, pa); return vbpolyenc(pa, n, out);
+ case P_VG8: DELTA( in, n, mode, pa); return vintg8enc(pa, n, out);
+ // --------- Simple family ---------
+ case P_SV: DELTA( in, n, mode, pa); vbput(out, pa[0]); return vsenc32( pa+1, n-1, out);
+
+ case P_S16: DELTAB(in, n, mode, b, pa); if(b>28) die("simple16 overflow. bit size > 28\n");
+ vbput(out, pa[0]); return vs16enc( pa+1, n-1, (unsigned *)out);
+ case P_S64: DELTAB(in, n, mode, b, pa); if(b>28) die("simple-8b overflow. bit size > 28\n");
+ vbput(out, pa[0]); return vs8benc( pa+1, n-1, out);
+ // --------- PFor -------------------------------------------------------------------------------------------------
+ case P_P4D: DELTA( in, n, mode, pa); vbput(out, pa[0]); return p4denc32( pa+1, n-1, out);
+ case P_P4DR: DELTR( in, n, mode, pa); vbput(out, pa[0]); return p4denc32( pa+1, n-1, out);
+
+ case P_OPTP4: DELTAB(in, n, mode, b, pa); if(b>28) die("optp4 overflow. bit size > 28\n");
+ if(n < 129) { return vbenc(pa, n, out); }
+ else { vbput(out, pa[0]); return out + OPT4(pa+1, n-1, (unsigned *)out); }
+ // --------- bit packing -----------------------------------------------------------------------------------------------
+ case P_PCK: DELTAB(in, n, mode, b, pa); vbput(out, pa[0]); *out++=b; return bitpack32(pa+1, n-1, b, out);
+ case P_PCKR: DELTRB(in, n, mode, b, pa); vbput(out, pa[0]); *out++=b; return bitpack32(pa+1, n-1, b, out);
+
+ case P_SIMDH:
+ if(n < 129) { DELTA(in, n, mode, pa); return vbyteenc((unsigned *)pa, n, (unsigned *)out); }
+ else { b = simdmaxbitsd1(in[0], in+1); vbput(out, in[0]); *out++=b; return simdpackwn1((unsigned *)(in+1), n-1, b, in[0], (unsigned *)out); }
+ }
+ die("besenc: no entry %d\n", id);
+}
+
+#define UNDELTA(__out, __n, __mode) { unsigned _x,_v; for(_x = __out[0],_v=1;_v<__n;_v++) __out[_v] = (_x += __out[_v] + __mode); }
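+// e.g. (illustrative) with mode=1: UNDELTA turns {3,1,3} back into {3,5,9}
+// by accumulating _x += out[v] + mode for v = 1..n-1.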
+
+unsigned char *besdec(unsigned char *in, size_t n, unsigned *out, int id, int mode) { unsigned b,x,v;
+ switch(id) {
+ case P_CPY: in = u32dec( (unsigned *)in, n, out); break;
+ //------------- Variable byte ----------------------------------------------
+ case P_VB: in = vbdec( in, n, out); UNDELTA(out, n, mode); break;
+
+ case P_VBL: in = vbytedec( in, n, out); UNDELTA(out, n, mode); break;
+ case P_VBP: in = vbpolydec( in, n, out); UNDELTA(out, n, mode); break;
+ case P_VG8: in = vintg8dec( in, n, out); UNDELTA(out, n, mode); break;
+ //------------- Simple family ----------------------------------------------
+ case P_SV: vbgeta(in, x, *out = x); in = vsdec32( in, n-1, out+1); UNDELTA(out, n, mode); break;
+
+ case P_S16: vbgeta(in, x, *out = x); in = vs16dec((unsigned *)in, n-1, out+1); UNDELTA(out, n, mode); break;
+ case P_S64: vbgeta(in, x, *out = x); in = vs8bdec( in, n-1, out+1); UNDELTA(out, n, mode); break;
+ // ------------ PFor -------------------------------------------------------
+ case P_P4D: vbgeta(in, x, *out = x); in = p4ddec32( in, n-1, out+1); UNDELTA(out, n, mode); break;
+ case P_P4DR: vbgeta(in, x, *out = x); return mode?p4dfdecx32(in, n-1, x, out+1):p4df0decx32( in, n-1, x, out+1);
+
+ case P_OPTP4:
+ if(n < 129) in = vbdec(in, n, out);
+ else { vbgeta(in, x, *out = x); unsigned all_array[2048]; in = (unsigned char *)detailed_p4_decode(out+1, (unsigned *)in, all_array); }
+ UNDELTA(out, n, mode);
+ break;
+ // --------- bit packing ----------------------------------------
+ case P_PCK: vbgeta(in, x, *out = x); b = *in++; return mode?bitdunpack32( in, n-1, b, x, out+1):bitd0unpack32( in, n-1, b, x, out+1);
+ case P_PCKR: vbgeta(in, x, *out = x); b = *in++; return mode?bitfunpackx32(in, n-1, b, x, out+1):bitf0unpackx32(in, n-1, b, x, out+1);
+
+ case P_SIMDH:
+ if(n < 129) { in = vbytedec(in, n, out); UNDELTA(out, n, mode); }
+ else { vbgeta(in, x, *out = x); b = *in++; in = simdunpackn1((uint32_t *)in, n-1, b, out[0], out+1); }
+ break;
+ }
+ return in;
+}
+
+//--------------------------------------- Zipfian generator --------------------------------------------------------
+int z_cmp(double *a, double *b) {
+ if(*a < *b) return -1;
+ if(*a > *b) return 1;
+ return 0;
+}
+
+void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) {
+ int i,m = x2 - x1 + 1;
+ double prob, cum, *zmap;
+ if(!(zmap = malloc(m*sizeof(zmap[0])))) die("mallo error\n");
+
+ // generate initial set (slow)
+ srand48(1);
+ for(cum = 0.0,i = 0; i < m; i++)
+ cum += 1.0 / pow(i+1, alpha);
+ cum = 1.0 / cum;
+ for(prob = 0.0,i = 0; i < m; i++)
+ zmap[i] = prob += cum / pow(i+1, alpha);
+
+ // use binary search to speed up zipfgen
+ qsort(zmap, m, sizeof(zmap[0]), (int(*)(const void*,const void*))z_cmp);
+ for(i = 0; i < n; i++) {
+ double r = drand48();
+ int l = 0, h = m-1;
+ while(l < h) {
+ int k = (l + h) >> 1;
+ if(r > zmap[k]) l = k + 1;
+ else h = k;
+ }
+ a[i] = x1 + l;
+ }
+ free(zmap);
+}
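+// zipfgen draws by inverse-CDF sampling: zmap[] holds the cumulative Zipf
+// probabilities for the range [x1..x2], and each uniform draw r is mapped by
+// binary search to the smallest bucket with zmap[bucket] >= r, so value i
+// occurs with probability proportional to 1/i^alpha.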
+
+//----------------------------------------------- Benchmark -------------------------------------------------------------------
+struct libss { int id; char *s;int size; };
struct libss libss[] = {
- { P_CPY, "copy", },
- { P_VB, "TurboVbyte" },
- { P_VBL, "Vbyte FPF" },
- { P_VG8, "vg8iu" },
+ { P_CPY, "Copy", 0 },
+ //---------------- Variable byte ---------------------------------
+ { P_VB, "TurboVbyte", 0 },
+ { P_VBL, "VbyteFPF", 0 },
+ { P_VG8, "VarintG8IU",0 },
+//{ P_VBP, "VBytePoly" },
+ // -------------- Simple family ----------------------------------
+ { P_SV, "SimpleV", 0 },
+ #ifdef USE_SIMPLE_8B
+ { P_S64, "Simple-8b",0 }, //crashes on 32 bits?
+ #endif
+ #ifdef USE_SIMPLE16
+ { P_S16, "Simple16", 0 }, //max. 28 bits
+ #endif
+ //--------------- PFor ------------------------------------------
+ #ifndef _WIN32
+ { P_P4DR, "TurboPForDA", 128 }, // currently not working with MinGW
+ #endif
+ { P_P4D, "TurboPFor", 128 },
- { P_SV, "simpleV" },
- { P_S8BO, "simple 8b" },
- { P_S16, "simple16" },
+ #ifdef USE_OPTPFD
+ { P_OPTP4, "OptPFD", 128 }, //max. 28 bits
+ #endif
+ //-------------- Bit Packing ------------------------------
+ { P_PCK, "TurboPack", PACK_SIZE },
+ { P_PCKR, "TurboPackDA", PACK_SIZE },
+ { P_SIMDH, "SIMDPackFPF", 128 },
- { P_P4DR, "TurboPFor DA" },
- { P_P4D, "TurboPFor" },
- { P_OPTP4, "OptP4" },
-
- { P_PCK, "TurboPack" },
- { P_PCKR, "TurboPack DA" },
- { P_SIMDH, "SIMDBitPack FPF" },
{ -1, "" },
};
-//---------------------------------------------------------------------------------------------
-#define MAXT 8
-#define BLK_SIZE 129
-#define MB (1024*1024)
+#define MB 1000000
+int verb = 0, reps = 1<<24, trips = 1, xcheck=1;
+unsigned xbits[33];
+enum { T_DUP, T_UNI, T_TXT, T_BYTE, T_TST };
-int verb = 0, reps = 100000, trips = 3;
-enum { T_ZIPF=1, T_ID };
-
-struct libs { int id,err; char *s,*v; unsigned long long l; double tc,td; };
+struct libs { int id,err,size; char *s,*v; unsigned long long l, c[33]; double tc,td; };
struct libs libs[64];
+void libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; }
int l_cmp(struct libs *a, struct libs *b) {
if(a->l < b->l || a->l == b->l && a->td < b->td) return -1;
@@ -349,149 +379,150 @@ void check(unsigned *in, unsigned n, unsigned *out, char *s) {
}
}
-void print(unsigned long long n, char *s) {
+void stprint() {
+ int m;
+ unsigned long long t=0;
+ for(m = 0; m < 33; m++)
+ t += xbits[m];
+ printf("\ndistribution:");
+ for(m = 0; m < 33; m++)
+ if(xbits[m]) printf("%d:%.2f%% ", m, (double)xbits[m]*100/t); printf("\n");
+}
+
+void print(unsigned long long n, char *s, unsigned long long *u) {
int m, k;
for(k = 0; libs[k].id >= 0; k++);
- qsort(libs, k, sizeof(libs[0]), l_cmp);
-
+ qsort(libs, k, sizeof(libs[0]), (int(*)(const void*,const void*))l_cmp);
+ char *prtname = s?s:""; { unsigned char *p; if((p = strrchr(prtname, '/')) || (p = strrchr(prtname, '\\'))) prtname = p+1;}
for(m = 0; m < k; m++)
if(libs[m].l) {
struct libs *lb = &libs[m];
- printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", s, lb->l, (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n,
+ printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", prtname, lb->l, (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n,
lb->tc>=0.000001?((double)n/1000000.0) / (lb->tc/TM_T):0.0,
lb->td>=0.000001?((double)n/1000000.0) / (lb->td/TM_T):0.0,
lb->s );
+ if(u && verb>3) { printf("\n");for(k = 0; k < 33; k++) if(u[k]) printf("%d:%.1f\t", k, (double)lb->c[k]*100/u[k]); printf("\n"); }
}
}
-//int libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; }
-
-unsigned bench(unsigned *__restrict__ _in, unsigned _inlen, int blksize, unsigned char *__restrict__ _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict__ cpy, int bb) { int m,id,b=bb,i; if(verb) { printf(":%d,", _inlen); fflush(stdout);}
- unsigned cn; tm_t tt0 = tminit();
+unsigned bench(unsigned *__restrict _in, unsigned _inlen, int blksize, unsigned char *__restrict _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict cpy, int bb, int mode ) { int m,id,b=bb,i;
+ if(!_inlen) return 0; if(verb>1) { printf(":%d,", _inlen); fflush(stdout); }
+ unsigned cn; tm_t tt0 = tminit();
for(i = 0; i < 10; i++) memcpy(_out, _in, _inlen);
- for(m = 0; (id=libs[m].id) >= 0; m++) { int r,insize=(id==P_OPTP4)?blksize-1:blksize;
- struct libs *lb = &libs[m]; unsigned cl; if(verb) { printf("%s", libs[m].s);fflush(stdout); } int t,tj; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt;
- for(t = 0; t < trips; t++) { t0 = tminit();
+ for(m = 0; (id=libs[m].id) >= 0; m++) {
+ blksize = libs[m].size?libs[m].size:blksize;
+ int r,insize=(mode>=0)?blksize+1:blksize;
+ struct libs *lb = &libs[m];
+ unsigned cl,cc[33]; if(verb) printf("%s,%d", libs[m].s, blksize);
+ int t,tj; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt;
+ for(t = 0; t < trips; t++) { t0 = tminit();
for(r = 0; r < reps; ) {
- cn=cl=0;
- unsigned *in;
- unsigned char *out,*sout; //vsini();
+ for(cl=0; cl<33; cl++) cc[cl]=0; cn=cl=0;
+ unsigned *in;
+ unsigned char *out;
for(out = _out, in = _in; in < _in+_inlen; ) {
unsigned n,inlen = *in++,*ip=in; in += inlen;
- *(unsigned *)out = inlen; out+=4;/*out++=0x5a;*/
- for(;ip < in; ip += n) { n = ip+insize<=in?insize:in-ip; cn += n; unsigned char *sout=out; //printf("%d ", n);
- out = beenc(ip,n,out,id,bb);
- cl +=out-sout;
- } if(out > _out+outsize) { fprintf(stderr, "Overflow error %lld, %lld in %s\n", outsize, (ptrdiff_t)(out - _out), lb->s); exit(0); }
- } r++; if((tt = tmtime() - t0) > tx) break;
- } if(tt < tc) { tc = tt; tj = r; }
- if(tmtime() - tt0 > tx*trips) { /*printf("#");fflush(stdout);*/ /*sleep(1);*/tt0 = tminit(); }
+ *(unsigned *)out = inlen; out += 4; unsigned char *sout = out;
+ for(;ip < in; ip += n) { n = ip+insize <= in?insize:in-ip; cn += n; if(out+5*n > _out+outsize) die("Overflow error %llu, %u in %s\n", outsize, (int)(ptrdiff_t)(out - _out), lb->s);
+ out = mode >= 0?besenc(ip, n, out, id, mode):beenc(ip, n, out, id, bb);
+ }
+ cl += out - sout; cc[bsr32(inlen)] += out - sout;
+ }
+ r++; if((tt = tmtime() - t0) > tx) break;
+ } if(tt < tc) tc = tt, tj = r; //if(tmtime() - tt0 > tx*trips) { sleelp(5); tt0 = tminit(); }
}
- lb->l += cl; lb->tc += tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("+ ");fflush(stdout);}
- tt0 = tminit();
- for(t = 0; t < trips; t++) { t0 = tminit();
- for(r = 0; r < reps; ) { unsigned *out; unsigned char *in;
+
+ for(t=0; t < 33; ++t) lb->c[t] += cc[t];
+
+ lb->l += cl; lb->tc += (double)tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("/");fflush(stdout);}
+ tt0 = tminit();
+ for(t = 0; t < trips; t++) { t0 = tminit();
+ for(r = 0; r < reps; ) {
+ unsigned *out; unsigned char *in;
for(out = cpy, in = _out; out < cpy+_inlen;) {
unsigned n,*op, outlen=*(unsigned *)in; in+=4;
*out++ = outlen;
for(op=out,out += outlen; op < out; op += n) {
n = op + insize<=out?insize:out-op;
- in = bedec(in,n,op,id,bb);
+ in = mode>=0?besdec(in,n,op,id, mode):bedec(in,n,op,id,bb);
}
- }
- r++;
- if((tt = tmtime() - t0) > tx)
- break;
- }
- if(tt < td) {
- td = tt;
- tj = r;
- }
- if(tmtime() - tt0 > tx*trips) {
- tt0 = tminit();
- }
- } lb->td += td/tj;
- check(_in, _inlen, cpy, lb->s);
+ }
+ r++; if((tt = tmtime() - t0) > tx) break;
+ } if(tt < td) td = tt, tj = r;
+ //if(tmtime() - tt0 > tx*trips) tt0 = tminit();
+ }
+ lb->td += (double)td/tj;
+ if(xcheck) check(_in, _inlen, cpy, lb->s);
}
return cn;
}
-int z_cmp(double **a, double **b) {
- if(*a < *b) return -1;
- if(*a > *b) return 1;
- return 0;
-}
-
-void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) {
- int i,m = x2 - x1 + 1;
- double prob, cum, *zmap;
- if(!(zmap = malloc(m*sizeof(zmap[0])))) {
- fprintf(stderr, "mallo error\n");
- exit(-1);
- };
-
- srand48(1);
- for(cum =0.0,i = 0; i < m; i++)
- cum += 1.0 / pow(i+1, alpha);
- cum = 1.0 / cum;
- for(prob=0.0,i = 0; i < m; i++)
- zmap[i] = prob += cum / pow(i+1, alpha);
- qsort(zmap, m, sizeof(zmap[0]), (int(*)(const void*,const void*))z_cmp);
-
- for(i = 0; i < n; i++) {
- double r = drand48();
- int l = 0, h = m-1;
- while(l < h) {
- int k = (l + h) >> 1;
- if(r > zmap[k]) l = k + 1;
- else h = k;
- }
- a[i] = x1 + l;
- }
- free(zmap);
+void usage() {
+ fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__);
+ fprintf(stderr, "Usage: icbench [options] [file]\n");
+ fprintf(stderr, "Use zipfian generator when no file specified\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -bNm N = blocksize (default 128) m=k kilobyte ex. -b64k\n");
+ fprintf(stderr, " -cN N = format ordered(0:delta+0,1:delta+1),2=convert text to integer format\n");
+ fprintf(stderr, " -eS N = encoder scheme (default all)\n");
+ fprintf(stderr, " -tN N = time in seconds per interation\n");
+ fprintf(stderr, " -TN N = Iterations (default 3)\n");
+ fprintf(stderr, " -vN N = verbosity 1..3\n");
+ fprintf(stderr, "----- file specified --------------\n");
+ fprintf(stderr, " -rN N = max. file size to read\n");
+ fprintf(stderr, "Ex. ./icbench -c1 gov2.sorted\n");
+ fprintf(stderr, "----- file not specified --------------\n");
+ fprintf(stderr, " -aF F = zipfian distribution alpha ex. -a1.0 uniform -a1.5 skewed\n");
+ fprintf(stderr, " -mN N = minimum integer generated in bits\n");
+ fprintf(stderr, " -MN N = maximum integer generated in bits\n");
+ fprintf(stderr, " -nN N = number of integers to generate\n");
+ fprintf(stderr, "Ex. ./icbench -a1.0 -m0 -x8 -n100000000\n");
+ exit(0);
}
#define OVD (10*MB)
-int main(int argc, char *argv[]) {
- char fname[0x100], *cmd=NULL;
- unsigned bp=0,ftype = T_ID, rm=0,rx=30,n=10000000;
- long long rdmax = 1<<30; tm_t tx=1*1000000;
+int main(int argc, char *argv[]) { int r;
+ char fname[0x100], *cmd=NULL;
+ unsigned xbp=0, rm=0,rx=30,n=0;
+ int mode = -1;
+ long long rdmax = 1ull<<32;
double a = 1.5;
-
+ tm_t tx=1*1000000;
+ unsigned blksize = PACK_SIZE;
tminit();
- VarIntG8IU();
-
- int c, digit_optind = 0;
- int this_option_optind = optind ? optind : 1, option_index = 0;
+ VarIntG8IU();
+ int c, digit_optind = 0, this_option_optind = optind ? optind : 1, option_index = 0;
static struct option long_options[] = { {"repeat", 0, 0, 'r'}, {0,0, 0, 0} };
for(;;) {
- if((c = getopt_long(argc, argv, "Ac:TBR:ys:r:n:b:c:e:t:r:M:v:m:x:a:", long_options, &option_index)) == -1) break;
+ if((c = getopt_long(argc, argv, "BshHa:b:c:e:f:m:n:r:R:T:v:M:", long_options, &option_index)) == -1) break;
switch(c) {
- case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
- case 'r': reps = atoi(optarg); break;
- case 'R': trips = atoi(optarg); break;
- case 'v': verb = atoi(optarg);verb++; break;
- case 't': tx = atoi(optarg)*1000000; break;
- case 'c': ftype = atoi(optarg); break;
- case 'b': rdmax = atoi(optarg)*MB; break;
- case 'e': cmd=optarg; break;
- case 'm': rm = atoi(optarg); break;
- case 'x': rx = atoi(optarg); break; //
- case 'B': bp++; break;
- case 'n': n = atoi(optarg); break;
- case 'a': a = strtod(optarg, NULL); break;
- default: fprintf(stdout,"unknown option: %c \n", optopt); exit(1);
+ case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
+ case 'a': a = strtod(optarg, NULL); break;
+ case 'b': { char *p; blksize = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') blksize *= 1024; if(blksize>BLK_SIZE) blksize = BLK_SIZE; } break;
+ case 'c': mode = atoi(optarg); break;
+ case 'f': rdmax = atoi(optarg)*MB; break;
+ case 'h': usage(); break;
+ case 'H': xcheck=0; break;
+ case 'e': cmd = optarg; break;
+ case 'm': rm = atoi(optarg); break;
+ case 'n': { char *p; n = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') n *= 1000; else if(*p == 'b' || *p == 'B') n *= 1000000000; else n *= 1000000; } break;
+ case 'r': reps = atoi(optarg); break;
+ case 'R': trips = atoi(optarg); break;
+ case 't': tx = atoi(optarg)*1000000; break;
+ case 'v': verb = atoi(optarg); break;
+ case 'M': rx = atoi(optarg); break;
+ default: usage();
}
}
- int fno,i=0; //libini();
- if(!bp) { rm = (1< n) rx = n; } else if(!rm) rm = 1;
- //printf("range=(%d,%d,%d)\n", rm, rx, n);fflush(stdout);
+ int fno,i=0;
+ if(!xbp) { rm = (1< n) rx = n; } else if(!rm) rm = 1; //printf("range=(min=%u, max=%u)\n", rm, rx);fflush(stdout);
+ // build the test functions set
struct libss *ls;
- if(cmd) {
- unsigned char *q=NULL;
+ if(cmd) {
+ char *q = NULL;
for(i=0,libs[0].id = -1;;) {
if(cmd) {
- if(!*cmd) break; //printf("cmd='%s'", cmd);
+ if(!*cmd) break;
q = strchr(cmd,',');
if(q) *q=' ';
if(q = strchr(cmd,'/'))
@@ -499,34 +530,32 @@ int main(int argc, char *argv[]) {
for(ls = libss; ls->id >= 0; ls++)
if(!strcasecmp(ls->s, cmd)) {
memset(&libs[i], 0, sizeof(struct libs));
- libs[i].id = ls->id;
- libs[i].err = 0;
- libs[i].s = ls->s;
- libs[i++].v = ls->v;
+ libs[i].id = ls->id;
+ libs[i].err = 0;
+ libs[i].s = ls->s;
+ libs[i++].size = ls->size; if(verb) printf("%s/", ls->s);fflush(stdout);
break;
}
- if(ls->id < 0) {
- printf("library: '%s' not found\n", cmd);
- exit(-1);
- }
+ if(ls->id < 0) die("library: '%s' not found\n", cmd);
cmd = q?(q+1):"";
}
}
} else for(ls = libss; ls->id >= 0; ls++) {
- libs[i].id = ls->id;
- libs[i].err = 0;
- libs[i].s = ls->s; //printf("%s\n", ls->s);fflush(stdout);
- libs[i++].v = ls->v;
+ libs[i].id = ls->id;
+ libs[i].err = 0;
+ libs[i].s = ls->s; if(verb) printf("%s/", ls->s);fflush(stdout);
+ libs[i++].size = ls->size;
}
- libs[i].id = -1;
-
- if(argc <= optind) {
- unsigned *in, *out, *cpy,*ip; unsigned long long totlen=0;
- in = malloc(n*4+OVD); if(!in) { printf("malloc err=%u", n); exit(0); }
- out = malloc(n*4+OVD); if(!out) { printf("malloc err=%u", n); exit(0); }
- cpy = malloc(n*4+OVD); if(!cpy) { printf("malloc err=%u", n); exit(0); }
+ libs[i].id = -1; if(verb) printf("\n");
+
+ if(argc <= optind) { // No file specified
+ if(!n) n = 100000000; if(rx > n) rx = n;
+ unsigned *in, *cpy,*ip; unsigned char *out; unsigned long long totlen=0;
+ in = malloc(n*4+OVD); if(!in) die("malloc err=%u", n);
+ out = malloc(n*4+OVD); if(!out) die("malloc err=%u", n);
+ cpy = malloc(n*4+OVD); if(!cpy) die("malloc err=%u", n);
char s[33]; s[0]=0;
- if(bp) {
+ if(mode == T_TST) { // Unit test for fixed bit sizes
int b;
printf("bittest\n"); fflush(stdout);
for(b = rm; b <= rx; b++) {
@@ -534,84 +563,100 @@ int main(int argc, char *argv[]) {
*in = n;
for(i = 1; i <= n; i++)
in[i] = (1ull << b)-1;
- totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, b);
- print(totlen, s);
+ totlen = bench(in+1, n, blksize, out, n*4+OVD, s, tx, cpy, b, mode);
+ print(totlen, s, NULL);
}
- } else {
- printf("zipf a=%3.1f [%u,%u]\n", a, rm, rx);
+ } else { // Benchmark w. generated data
+ printf("zipf alpha=%3.1f range[%u..%u].\nbit size histogramm: ", a, rm, rx);
*in = n;
- zipfgen(in+1, a, rm, rx, n); //stprint();
- totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, -1);
- print(totlen, s);
+ zipfgen(in+1, a, rm, rx, n); for(i = 1; i <= n; i++) xbits[bsr32(in[i])]++; stprint();
+ if(mode>=0) { unsigned *ip=in+1; int v; for(v = 1; v < n; v++) { ip[v] += ip[v-1] + mode; if(ip[v]>(1u<<28)) die("overflow generating sorted array\n" ); } }
+ totlen = bench(in, n+1, blksize, out, n*4+OVD, s, tx, cpy, -1, mode);
+ print(totlen, s, NULL);
}
free(in);
free(cpy);
free(out);
- } else for(fno = optind; fno < argc; fno++) {
+ } else for(fno = optind; fno < argc; fno++) { // Benchmark w. specified data files
+ libini();
char *inname = argv[fno];
- FILE *fi = fopen64(inname, "r");
- if(!fi) {
- fprintf(stderr, "open error '%s'", inname); perror(inname);
- exit(-1);
- }
- fseek(fi, 0, SEEK_END);
- unsigned long long fisize = ftell(fi);
- fseek(fi, 0, SEEK_SET);
- if(fisize > rdmax)
- fisize = rdmax;
- fisize /= 4; //setvbuf(fi, NULL, _IOFBF, 1000*MB);
- unsigned *in, *out, *cpy,*ip;
- unsigned long long totlen=0;
- int rc;
- out = malloc(fisize*4+OVD); if(!out) { printf("malloc err=%u", fisize); exit(0); }
- cpy = malloc(fisize*4+OVD); if(!cpy) { printf("malloc err=%u", fisize); exit(0); }
- in = malloc(fisize*4+1024); if(!in) { printf("malloc err=%u", fisize); exit(0); } PGM_FD(fileno(fi));
- int r; fread(&r, 4, 1, fi);
- while(r > 0) {
- for(ip = in; ip+r <= in+fisize;) {
- int rc; PGM_FDPUT(fileno(fi));
- if((rc = fread(ip+1, 4, r, fi)) <= 0)
- goto a;
-
- if(r >= rm && r <= rx) {
- *ip++ = r;
- int j;
- if(verb)
- printf("%d ", r, ftype==T_ID?"I":"N");
- fflush(stdout);
- if(ftype == T_ID) {
- for(j = 0; j < r; ) {
- unsigned m = j+BLK_SIZE>r?r-j:BLK_SIZE;
- int i,did,dido = -1;
- for(i = 0; i < m; i++) {
- did = ip[i];
- if(did < dido) {
- printf("IDs in '%s' not sorted.did=%d,dido=%d ", inname, did, dido);
- exit(0);
- }
- ip[i] = did - dido - 1;
- dido = /*ip[0]*/did; //printf("%d,", ip[i]); xbits[bsr32(ip[i])]++;
- }
- j += m; ip += m; //printf("\r");
- }
- } else
- ip += r;
+ if(mode == T_TXT || mode == T_BYTE) { //------------ convert text file to integer array format
+ FILE *fi = fopen(inname, "r"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); }
+ char outname[257]; strcpy(outname, inname); strcat(outname, ".dat");
+ FILE *fo = fopen(outname, "wb"); if(!fo) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); }
+ #define LSIZE 16
+ char s[LSIZE+1];
+ unsigned num = 0;
+ fwrite(&num, 1, 4, fo);
+ if(mode == T_TXT) {
+ while(fgets(s, LSIZE, fi)) {
+ s[strlen(s) - 1] = 0;
+ unsigned i = strtoul(s, NULL, 10);
+ fwrite(&i, 1, 4, fo);
+ num++;
+ }
+ } else {
+ unsigned u;
+ unsigned char c;
+ while(fread(&c, 1, 1, fi)>0){
+ u = c;
+ fwrite(&u, 1, 4, fo);
+ num++;
}
- r = rc = 0;
- if(ftype == T_ID)
- rc = fread(&r, 4, 1, fi);
- if(rc <= 0 || !r)
- break;
}
- totlen += bench(in, ip-in, BLK_SIZE, out, fisize*4+OVD, inname, tx, cpy, -1);
- if(totlen > n)
- break;
- }
- a:fclose(fi); //stprint();
- print(totlen,inname);
- free(in);
- free(cpy);
- free(out);
- }
-}
+ fseeko(fo, 0, SEEK_SET);
+ fwrite(&num, 1, 4, fo); printf("num=%u\n", num);
+ fclose(fo);
+ fclose(fi);
+ continue;
+ }
+ // process integer array file
+ FILE *fi = fopen64(inname, "rb");
+ if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); }
+ fseeko(fi, 0, SEEK_END);
+ unsigned long long fisize = ftello(fi); fseeko(fi, 0, SEEK_SET); //printf("fisize=%llu\n", fisize);
+ if(fisize > rdmax) fisize = rdmax;
+ fisize /= 4;
+
+ unsigned *in, *cpy,*ip,num; unsigned char *out;
+ unsigned long long outsize=fisize*5+OVD,totlen=0,bitslen[33]={0};
+ out = malloc(outsize); if(!out) die("malloc err=%llu", fisize);
+ cpy = malloc(fisize*4+OVD); if(!cpy) die("malloc err=%llu", fisize);
+ in = malloc(fisize*4+1024); if(!in) die("malloc err=%llu", fisize);
+
+ ip = in;
+ while(fread(&num, 1, 4, fi) == 4 && num) { //printf("?");fflush(stdout);
+ if(num < rm || num > rx) { fseeko(fi, num*4, SEEK_CUR); continue; }
+ if(ip+num > in+fisize) {
+ totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode); printf("#%u", (unsigned)(totlen/1000000));fflush(stdout);
+ if(n && totlen > n)
+ break;
+ ip = in;
+ }
+ *ip++ = num; if(fread(ip, 4, num, fi) != num) break;
+ bitslen[bsr32(num)] += num*4;
+ #ifdef STATS
+ unsigned *ep = ip+num,insize=(mode>=0)?blksize+1:blksize;
+ while(ip < ep) {
+ unsigned m = min(ep-ip, insize),i;
+ if(mode >= 0) {
+ for(i = 1; i < m; i++) {
+ if(verb>3) printf(":%u ", ip[i]);fflush(stdout);
+ xbits[bsr32((ip[i] - ip[i-1]) - mode)]++;
+ if(ip[i] < ip[i-1]+mode) die("IDs in '%s' not sorted.[did=%u,%u] at line=%d\n", inname, ip[i], ip[i-1], (int)(ip-in));
+ }
+ } else for(i = 0; i < m; i++) xbits[bsr32(ip[i])]++;
+ ip += m;
+ }
+ #else
+ ip += num;
+ #endif
+ }
+ fclose(fi);
+ totlen += bench(in, ip-in, blksize, out, outsize, inname, tx, cpy, -1, mode);
+ printf("#%u", (unsigned)(totlen/1000000));
+ free(in); free(cpy); free(out);
+ stprint(); print(totlen,inname, bitslen);
+ }
+}
diff --git a/idx.h b/idx.h
new file mode 100644
index 0000000..18282f3
--- /dev/null
+++ b/idx.h
@@ -0,0 +1,20 @@
+#include <stdint.h>
+
+#define BLK_DIDNUM (128+1) // block size: 128 packed docids + 1 (the block's first docid is stored in the skip index)
+
+
+// Compression method. Set only one METHOD!
+ // compressed size for 62 GB clueweb09.sorted
+ // Default is bitpack/bitunpack 18 GB
+#define USE_SIMDPACK // SIMD Bitpacking 18 GB
+//#define USE_TURBOPFOR // for compact version 12 GB
+//#define USE_TURBOPACKD
+
+//-------------------------- Mapping term id <-> posting offset in file ----------------------------------
+typedef struct { uint8_t offseth; uint32_t offsetl; } __attribute__ ((packed)) tmap_t; // 40-bit offsets -> up to 1 TB postings file
+
+#define TIDMAPSET(__t, __ofs) { (__t)->offseth = (__ofs)>>32; (__t)->offsetl = (__ofs) & 0xffffffff; }
+#define TIDMAPGET(__t) ((__off64_t)(__t)->offseth << 32 | (__t)->offsetl)
+#define TIDMAP(__fdm, __tid) ({ char *_bp = __fdm; tmap_t *_t = (tmap_t *)&_bp[(__tid)*sizeof(tmap_t)]; TIDMAPGET(_t); })
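+// Illustrative example: ofs = 0x1234567890 stores as offseth=0x12, offsetl=0x34567890;
+// TIDMAPGET reassembles ((__off64_t)0x12 << 32) | 0x34567890. The packed 5-byte entry
+// keeps the term map small while still addressing 2^40 bytes of postings.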
+//--------------------------------------------------------------------------------------------------------
+
diff --git a/idxcr.c b/idxcr.c
new file mode 100644
index 0000000..37815f6
--- /dev/null
+++ b/idxcr.c
@@ -0,0 +1,153 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo [AT] gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ idxcr.c - "Integer Compression" Create inverted index for using by idxqry for benchmarking
+**/
+#define _LARGEFILE64_SOURCE 1
+#define _FILE_OFFSET_BITS 64
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "vint.h"
+#include "vp4dc.h"
+
+#include "bitpack.h"
+#include "idx.h"
+//-------------------------------------- Simdcomp --------------------------------------------------------------------------
+#include "ext/simdcomp/include/simdbitpacking.h"
+
+unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
+ uint32_t *in_;
+ for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
+ return (unsigned char *)out;
+}
+unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
+ uint32_t *in_;
+ for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b);
+ return (unsigned char *)out;
+}
+//---------------------------------------------------------------------------------------------------------------
+#define DELTA( __in, __n, __b) { unsigned _v; for(__b=0,_v = __n-1; _v > 0; --_v) __in[_v] = (__in[_v] - __in[_v-1]) - 1, __b |= __in[_v]; __b = bsr32(__b); }
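+// In-place d-gap transform: docid lists are strictly increasing, so (gap - 1) is
+// stored, saving one bit in the common gap==1 case; __b ORs all gaps together and
+// bsr32 then yields the bit width needed to pack the whole block.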
+
+#define TERMNUM 2000000
+int verb;
+
+void usage() {
+ fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__);
+ fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n");
+ fprintf(stderr, "Create inverted index from 'Document identifier data set' format\n");
+ fprintf(stderr, "See http://lemire.me/data/integercompression2014.html'\n");
+ fprintf(stderr, "Usage: idxcr \n");
+ fprintf(stderr, "ex. idxcr clueweb09.sorted idxdir\n\n");
+ exit(-1);
+}
+
+int main(int argc, char *argv[]) {
+ int fno,c, digit_optind = 0, this_option_optind = optind ? optind : 1, option_index = 0; char *path="";
+ static struct option long_options[] = { {"r", 0, 0, 'r'}, {0,0, 0, 0} };
+ for(;;) {
+ if((c = getopt_long(argc, argv, "xv:", long_options, &option_index)) == -1) break;
+ switch(c) {
+ case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
+ case 'v': verb = atoi(optarg); break;
+ default: die("unknown option: %c \n", optopt);
+ }
+ }
+ if(argc - optind < 2) usage();
+ tmap_t *tmap = malloc(TERMNUM*sizeof(tmap_t)); if(!tmap) die("malloc error\n");
+ path = argv[--argc];
+
+ for(fno = optind; fno < argc; fno++) {
+ char outname[257], *inname = argv[fno];
+ strcpy(outname, path);
+ char *p = strrchr(inname,'/');
+ if(!p) p = strrchr(inname,'\\'); if(!p) p=inname;
+ strcat(outname, p); strcat(outname,".i");
+
+ FILE *fi = fopen64(inname, "rb"); if(!fi) { fprintf(stderr, "open error '%s'", inname); perror(inname); exit(-1); } int fdi = fileno(fi);
+ FILE *fo = fopen64(outname,"wb"),*fm; if(!fo) { fprintf(stderr, "creat error '%s'", outname); perror(outname); exit(-1); } fprintf(stderr, "file='%s'", outname);
+ fseeko(fo, sizeof(unsigned)+sizeof(unsigned long long), SEEK_SET);
+
+ unsigned *in = NULL,*ip,*ep,num,tid=0,numx=0,outsize;
+ unsigned char *out = NULL;
+ unsigned long long fofs;
+
+ while(fread(&num, 1, 4, fi) == 4 && num) { // read number of docid in term
+ unsigned bnum = (num+BLK_DIDNUM-1)/BLK_DIDNUM;
+ if(num > numx) { numx = num;
+ in = realloc(in, num*4+64);
+ outsize = num*4+bnum*sizeof(unsigned)*2+1024;
+ out = realloc(out, outsize);
+ if(!in || !out) die("malloc err=%u", num);
+ }
+
+ if(fread(in, 4, num, fi) != num) break; // read docid list
+ unsigned char *op = out,*_op;
+ vbput(op, num); // store f_t
+
+ unsigned *pix = (unsigned *)op;
+ if(num > BLK_DIDNUM) op += bnum*sizeof(unsigned)*2;
+ for(_op = op, ip = in, ep = ip+num; ip < ep; ) {
+ if(num > BLK_DIDNUM) { // skip/index. docid[0] and offset to compressed block
+ *pix = ip[0]; // First docid
+ pix[bnum] = op-_op; // offset
+ pix++;
+ } else vbput(op, ip[0]); // skip not needed
+
+ unsigned n = min(ep-ip, BLK_DIDNUM),b=0; if(op+5*n > out+outsize) die("output buffer too small\n");
+ if(n > 1) {
+ DELTA(ip, n, b);
+ #ifdef USE_SIMDPACK
+ if(n < 129) { *op++ = b; op = bitpack32( ip+1, n-1, b, op); } //op = vbenc(ip+1, n-1, op);
+ else { *op++ = b; op = simdpackwn(ip+1, n-1, b, (unsigned *)op); }
+ #elif defined(USE_TURBOPFOR)
+ op = p4denc32( ip+1, n-1, op);
+ #else
+ *op++ = b; op = bitpack32(ip+1, n-1, b, op);
+ #endif
+ }
+ ip += n;
+ }
+ fofs = ftello(fo);
+ tmap_t *t = &tmap[tid++];
+ TIDMAPSET(t, fofs);
+ if(fwrite(out, 1, op-out, fo) < 0) die("fwrite error\n");
+ }
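+ // Resulting posting layout per term: vbyte f_t; then, for lists spanning more than
+ // one block, bnum first-docids followed by bnum block byte-offsets as the skip index;
+ // then per block the bit width b and the packed d-gaps.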
+ fofs = ftello(fo); // write termmap
+ if(fwrite(tmap, 1, tid*sizeof(tmap_t), fo) < 0) die("fwrite error\n");
+
+ fseeko(fo, 0, SEEK_SET);
+ if(fwrite(&fofs, 1, sizeof(unsigned long long), fo) < 0) die("fwrite error\n");
+ if(fwrite(&tid, 1, sizeof(unsigned), fo) < 0) die("fwrite error\n");
+
+ fclose(fi); fclose(fo);
+ if(in) { free(in); free(out); }
+ }
+}
diff --git a/idxqry.c b/idxqry.c
new file mode 100644
index 0000000..06365a7
--- /dev/null
+++ b/idxqry.c
@@ -0,0 +1,364 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo [AT] gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+**/
+#define _LARGEFILE64_SOURCE 1
+#define _FILE_OFFSET_BITS 64
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <limits.h>
+#include <time.h>
+ #ifndef _WIN32
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+ #endif
+#include <errno.h>
+
+#include "conf.h"
+#include "vint.h"
+#include "bitunpack.h"
+#include "vp4dd.h"
+#include "idx.h"
+
+#define STATS
+//---------------------------------------- Time ---------------------------------------------------------------------
+typedef unsigned long long tm_t;
+#define TM_TMAX (1ull<<63)
+
+#include <sys/time.h>
+#define TM_T 1000000.0
+static tm_t tmtime(void) { struct timeval tm; gettimeofday(&tm, NULL); return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec; }
+static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+static double tmsec( tm_t tm) { return (double)tm/1000000.0; }
+static double tmmsec(tm_t tm) { return (double)tm/1000.0; }
+
+//--------------------------------------- Simdcomp -------------------------------------------------------------------
+#include "ext/simdcomp/include/simdbitpacking.h"
+unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
+ uint32_t k, *out_;
+ for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack((const __m128i *)in, out, b);
+ return (unsigned char *)in;
+}
+unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
+ uint32_t k, *out_;
+ for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b);
+ return (unsigned char *)in;
+}
+
+//------------------------------------- index file (created by idxcr) -------------------------------------------------------------
+typedef struct { // Index
+ unsigned char *fdp, // posting
+ *fdm; // mapping term id to offset in posting
+ unsigned long long fdsize;
+ unsigned tnum;
+} idxrd_t;
+
+int idxopen(idxrd_t *idx, char *s) {
+ int fd; char *p;
+ if((fd = open(s, O_RDONLY| O_LARGEFILE)) < 0)
+ die("can't open index file '%s' rc=%d:%s\n", s, errno, strerror(errno));
+ struct stat sbuf; // Memory mapped access
+ fstat(fd, &sbuf);
+ if(sbuf.st_size > 0 && (p = mmap( NULL, sbuf.st_size , PROT_READ, MAP_SHARED|MAP_NORESERVE, fd, 0)) == (void *)-1)
+ die("mmap errno=%d,'%s'\n", errno, strerror(errno) );
+ close(fd);
+
+ idx->fdsize = sbuf.st_size;
+ idx->fdp = p;
+ idx->fdm = p + *(uint64_t *)p; p += sizeof(uint64_t); // Termid map table. Termid->Posting
+ idx->tnum = *(unsigned *)p;
+ return 0;
+}
+
+int idxclose(idxrd_t *idx) {
+ munmap(idx->fdp, idx->fdsize);
+}
+
+//--------------------------------- Posting --------------------------------------------------------------
+#ifdef STATS
+unsigned long long st_tot,st_dec;
+#define STATINI st_tot=st_dec=0
+#define STAT(a) a
+#else
+#define STATINI
+#define STAT(a)
+#endif
+
+typedef struct {
+ unsigned char *bp,*p;
+ unsigned f_t,_f_t, did,ldid;
+ int didno,didnum, bno, bnum;
+} post_t;
+
+// Init posting for term id tid
+int postinit( post_t *post, int tid, idxrd_t *idx, unsigned *dids) {
+ unsigned long long o = TIDMAP(idx->fdm, tid); if(!o) return 0;
+ unsigned char *p = idx->fdp + o; // start of posting;
+ post->f_t = vbget(p); // num docs
+ post->bnum = (post->f_t+BLK_DIDNUM-1)/BLK_DIDNUM; // num blocks
+ post->_f_t = post->f_t;
+ post->didno = post->bno = -1;
+ post->bp = p; // start skip block
+ post->p = p + post->bnum*sizeof(unsigned)*2; // start posting block
+ dids[0] = INT_MAX;
+ post->ldid = 0; post->did = -1;
+ post->didnum = min(post->f_t,BLK_DIDNUM); STAT(st_tot += post->f_t);
+ if(post->f_t <= BLK_DIDNUM) post->bno=post->bnum;
+ return post->f_t;
+}
+
+// Get next docid. Return value >= INT_MAX at end of posting
+static inline ALWAYS_INLINE unsigned postnext(post_t *post, unsigned *dids) {
+ if((post->did += dids[++post->didno] + 1) < INT_MAX) return post->did;
+
+ unsigned char *p = post->bp;
+ if(post->f_t > BLK_DIDNUM) {
+ if(++post->bno >= post->bnum) return INT_MAX;
+ unsigned *pix = (unsigned *)p + post->bno;
+ dids[0] = *pix; // first did in block
+ p = post->p + pix[post->bnum]; // o=offset to posting block
+ } else dids[0] = vbget(p);
+
+ post->didnum = min(post->_f_t, BLK_DIDNUM);
+ post->_f_t -= post->didnum; //STAT(st_dec+=post->didnum);
+ if(post->didnum > 1) {
+ #if defined(USE_SIMDPACK)
+ unsigned b = *p++;
+ if(post->didnum < 129) p = bitunpack32(p, post->didnum-1, b, &dids[1]); //p = vbdec(p, post->didnum-1, &dids[1]);
+ else { p = simdunpackn( (unsigned *)p, post->didnum-1, b, &dids[1]); }
+ #elif defined(USE_TURBOPFOR)
+ p = p4ddec32( p, post->didnum-1, &dids[1]);
+ #else
+ unsigned b = *p++; p = bitunpack32(p, post->didnum-1, b, &dids[1]);
+ #endif
+ }
+ dids[post->didnum] = INT_MAX;
+ post->didno = 0;
+ return post->did = dids[0];
+}
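+// Decoding is lazy: dids[] caches one decompressed block of d-gaps terminated by an
+// INT_MAX sentinel. postnext() walks the cache, and only when the sentinel is hit
+// does it fetch the next block, using the skip entries (first docid + byte offset)
+// to locate and unpack it -- only the minimum necessary blocks are decompressed.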
+
+// Get next docid equal or greater than the parameter did
+static inline ALWAYS_INLINE unsigned postget(post_t *post, unsigned did, unsigned *dids) {
+ if(did < post->ldid) { // pending dids
+ for(;;) {
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ }
+ if(post->did < INT_MAX) return post->did;
+ }
+
+ unsigned char *p = post->bp; //Skip index
+ if(post->f_t > BLK_DIDNUM) {
+ unsigned *_q = (unsigned *)p,*q=_q+(++post->bno),*qe=_q+post->bnum-1;
+ for(;;) {
+ if(q[1] >= did || q >= qe) break; q++;
+ if(q[1] >= did || q >= qe) break; q++;
+ if(q[1] >= did || q >= qe) break; q++;
+ if(q[1] >= did || q >= qe) break; q++;
+ }
+ post->bno = q - _q;
+ if(q < qe) {
+ if(did < _q[0]) { post->bno=-1;post->ldid = _q[0]; return _q[0]; }
+ post->ldid = q[1];
+ } else {
+ post->ldid = INT_MAX;
+ post->didnum = post->f_t - post->bno*BLK_DIDNUM;
+ q = qe;
+ }
+ post->bno = q-_q;
+ dids[0] = post->did = *q; // first did in block
+ p = post->p+q[post->bnum]; // o=offset to posting block
+ } else {
+ post->ldid = INT_MAX;
+ dids[0] = post->did = vbget(p);
+ }
+ STAT(st_dec+=post->didnum);
+ if(post->didnum > 1) {
+ #if defined(USE_SIMDPACK)
+ unsigned b = *p++;
+ if(post->didnum < 129) p = bitunpack32(p, post->didnum-1, b, &dids[1]); //p = vbdec(p, post->didnum-1, &dids[1]);
+ else { p = simdunpackn( (unsigned *)p, post->didnum-1, b, &dids[1]); }
+ #elif defined(USE_TURBOPFOR)
+ p = p4ddec32( p, post->didnum-1, &dids[1]);
+ #else
+ unsigned b = *p++; p = bitunpack32(p, post->didnum-1, b, &dids[1]);
+ #endif
+ }
+ dids[post->didnum] = INT_MAX;
+ for(post->didno=0; ; ) {
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ if(post->did >= did) break; post->did += dids[++post->didno]+1;
+ }
+ return (post->did >= INT_MAX)?post->ldid:post->did;
+}
+//----------------------------------------- query search ------------------------------------------
+#define TERMNUM 32
+
+typedef struct {
+ int term[TERMNUM], terms, id;
+} qry_t;
+
+int postcmp(post_t *a, post_t *b) {
+ if(a->f_t < b->f_t) return -1;
+ if(a->f_t > b->f_t) return 1;
+ return 0;
+}
+
+int intersec_max;
+
+unsigned idxsearch(idxrd_t *idx, qry_t *q) {
+ int f_t = 0, i;
+ post_t *p, *pe, post[TERMNUM];
+ unsigned did, elim, dids[TERMNUM][BLK_DIDNUM+31];
+
+ if(q->terms == 1) { // 1 Term query
+ if(!(f_t = postinit(post, q->term[0], idx, dids[0])))
+ return 0;
+ for(i = 0; i < min(f_t,intersec_max); i++) {
+ if((did = postnext(post, dids[0])) >= INT_MAX) break;
+ f_t++;
+ }
+ } else if(q->terms == 2) { // optimized 2 terms query
+ if(!postinit(&post[0], q->term[0], idx, dids[0]) || !postinit(&post[1], q->term[1], idx, dids[1]))
+ return 0;
+ if(post[1].f_t < post[0].f_t) { post_t t = post[0]; post[0] = post[1]; post[1] = t; } // swap
+ for(elim=did=0,f_t=0;;) {
+ if(unlikely((did = postget(&post[0], did, dids[0])) >= INT_MAX)) break;
+ if(( elim = postget(&post[1], did, dids[1])) == did) {
+ if(++f_t >= intersec_max) break;
+ did++;
+ continue;
+ } else if(elim >= INT_MAX) break;
+ did = elim;
+ }
+ } else { // multiple terms conjunctive query
+ pe = &post[q->terms];
+ for(p = post; p < pe; p++)
+ if(!postinit(p, q->term[p-post], idx, dids[p-post])) return 0;
+ qsort(post, q->terms, sizeof(post[0]), (int(*)(const void*,const void*))postcmp); // sort by f_t
+
+ for(did = 0;;did++) {
+ a:if(unlikely((did = postget(post, did, dids[0])) >= INT_MAX)) return f_t;
+ for(p = &post[1]; p < pe; p++) {
+ if((elim = postget(p, did, dids[p-post])) == did) continue;
+ if(elim >= INT_MAX) return f_t;
+ did = elim;
+ goto a;
+ }
+ if(++f_t >= intersec_max) break;
+ }
+ }
+ return f_t;
+}
+
+//------------------------------ Test + Benchmark ----------------------------------------------------
+#define QRYLEN 255
+int qline, temin = 1,temax = TERMNUM,tex=0,qmax=1<<30;
+unsigned long long qrybatch(idxrd_t *idx, char *fqname, int *qid) {
+ char s[QRYLEN+1],*p,*q;
+ int id=0;
+ unsigned long long f_t=0;
+ FILE *fq;
+
+ if(!(fq = fopen(fqname, "r+")))
+ die("can't open file '%s'\n", fqname);
+
+ while(fgets(s, QRYLEN, fq)) { ++qline;
+ s[strcspn(s, "\n")] = 0;
+ qry_t qry;
+ for(qry.terms=0,p=s; *p && qry.terms < TERMNUM; ) {
+ while(*p && (*p < '0' || *p > '9')) p++; if(!*p) break;
+ q = p; while(*p >= '0' && *p <= '9') p++;
+ qry.term[qry.terms++] = strtol(q, NULL, 10);
+ }
+ if(qry.terms >= temin && qry.terms <= temax) { //int j; for(j=0;j < qry.terms;j++) { if(j) printf(" "); printf("%u", qry.term[j]); } printf(" %d \n", qry.terms);
+ qry.id = ++id; tex = max(qry.terms,tex);
+ f_t += idxsearch(idx, &qry); if(id >= qmax) break;
+ }
+ }
+ fclose(fq);
+ *qid = id;
+ return f_t;
+}
+
+void usage() {
+ fprintf(stderr, "\nTurboPFor Copyright (c) 2013-2015 Powturbo %s\n", __DATE__);
+ fprintf(stderr, "https://github.com/powturbo/TurboPFor\n\n");
+ fprintf(stderr, "Benchmark for intersections in inverted index\n\n");
+ fprintf(stderr, "Usage: idxqry [options] \n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, " -nN N = max. intersections/query. ex. -n1k=100.000 -n1m=1.000.000\n");
+ fprintf(stderr, " -mN N = minimum query terms (default 1)\n");
+ fprintf(stderr, " -MN N = maximum query terms (default 16)\n");
+ fprintf(stderr, " -rN N = number of iterations (default 3)\n");
+ fprintf(stderr, " -qN N = max. number of queries\n");
+ fprintf(stderr, " index created by 'idxcr' program\n");
+ fprintf(stderr, "Ex. idxqry -n100k -m2 clueweb.sorted.i aol.txt\n");
+ fprintf(stderr, "Ex. idxqry gov2.sorted.i 1mq.txt\n");
+ fprintf(stderr, "8-16 GB RAM recommended\n\n");
+ exit(-1);
+}
+
+int main(int argc, char **argv ) {
+ int reps = 3,i;
+
+ int c, option_index = 0;
+ static struct option long_options[] = { {"", 0, 0, 'r'}, {0,0, 0, 0} };
+ for(;;) {
+ if((c = getopt_long(argc, argv, "n:m:M:q:r:", long_options, &option_index)) == -1) break;
+ switch(c) {
+ case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
+ case 'q': qmax = atoi(optarg); break;
+ case 'r': reps = atoi(optarg); break;
+ case 'm': temin = atoi(optarg); break;
+ case 'M': temax = atoi(optarg); break;
+ case 'n': { char *p; intersec_max = strtol(optarg, &p, 10); if(*p == 'k' || *p == 'K') intersec_max *= 1000; else if(*p == 'm' || *p == 'M') intersec_max *= 1000000; } break;
+ default: usage();
+ }
+ }
+ if(argc <= optind) usage();
+ if(intersec_max) printf("Max. Intersections/query=%d\n", intersec_max);
+ else intersec_max=1<<30;
+
+ idxrd_t idx;
+ if(idxopen(&idx, argv[optind]))
+ die("can't open idx file '%s'\n", argv[optind]);
+ for(i=0; i < reps; i++) { STATINI;
+ int id; tm_t t0 = tminit();
+ unsigned long long inum = qrybatch(&idx, argv[optind+1], &id ); tm_t t1 = tmtime()-t0;
+ printf("qry=%d/%.2fs. [%.1f q/s] [%.3f ms/q] %llu docs found\n", id, tmsec(t1), (double)id/tmsec(t1), tmmsec(t1)/(double)id, inum );
+ if(i < reps-1 && tmsec(t1) > 30) sleep(20); // cool down between long runs
+ }
+ idxclose(&idx);
+ #ifdef STATS
+ if(st_tot) printf("Terms=[%d-%d] Integers: total=%llu decoded=%llu ratio=%.2f%%\n", temin, tex, st_tot, st_dec, (double)st_dec*100/(double)st_tot);
+ #endif
+}
+
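The conjunctive query above is a classic frequency-ordered intersection: postings are sorted by term frequency, the rarest list proposes a candidate document id, and every other list skips forward to the first id >= that candidate (postget decompresses only the blocks it lands on). A minimal self-contained sketch of the same idea over plain sorted arrays — `lists`, `pos` and `lens` are hypothetical inputs, not the index API above:

    // Intersect k sorted id lists; lists should be ordered by increasing length
    // and pos[] must be zero-initialized by the caller. Illustration only.
    static int intersect(unsigned **lists, int *pos, int *lens, int k) {
        int j, found = 0;
        unsigned did = lists[0][0];
        for(;;) {
            int agree = 1;
            for(j = 0; j < k; j++) {
                while(pos[j] < lens[j] && lists[j][pos[j]] < did) pos[j]++; // skip to first id >= did
                if(pos[j] == lens[j]) return found;                        // one list exhausted: done
                if(lists[j][pos[j]] != did) { did = lists[j][pos[j]]; agree = 0; break; } // restart from larger id
            }
            if(agree) { found++; did++; }                                  // id present in all lists
        }
    }

The real index does the same skipping over compressed blocks, which is why only a small fraction of the integers is ever decoded (see the decoded/total ratio printed with STATS).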
diff --git a/makefile b/makefile
index 0488c6e..1b7b678 100644
--- a/makefile
+++ b/makefile
@@ -1,28 +1,47 @@
-# powturbo (c) Copyright 2007-2013
-CFLAGS=-ffast-math -fstrict-aliasing -march=native -w -fpermissive
+# powturbo (c) Copyright 2007-2015
+CFLAGS=-ffast-math -DNDEBUG -fstrict-aliasing -m64 -march=native
BIT=./
-all: icbench
+all: icbench idxcr idxqry
bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunpack64_.h
- cc -O2 $(CFLAGS) -c $(BIT)bitunpack.c
+ gcc -O3 $(CFLAGS) -c $(BIT)bitunpack.c
bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h
- cc -O2 $(CFLAGS) -c $(BIT)bitpack.c
+ gcc -O2 $(CFLAGS2) -c $(BIT)bitpack.c
vp4dc.o: $(BIT)vp4dc.c
- cc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c
+ gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dc.c
-SIMDCOMPD=aux/simdcomp/
+vp4dd.o: $(BIT)vp4dd.c
+ gcc -O3 $(CFLAGS2) -funroll-loops -c $(BIT)vp4dd.c
+
+SIMDCOMPD=ext/simdcomp/
SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o
-varintg8iu.o: $(BIT)aux/varintg8iu.c $(BIT)aux/varintg8iu.h
- cc -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)aux/varintg8iu.c
+varintg8iu.o: $(BIT)ext/varintg8iu.c $(BIT)ext/varintg8iu.h
+ gcc -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)ext/varintg8iu.c
-icbench: icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o varintg8iu.o vp4dd.o vp4dc.o $(SIMDCOMP)
- cc -O3 icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o vp4dd.o vp4dc.o varintg8iu.o $(SIMDCOMP) -lm -o icbench $(LFLAGS)
+icbench: icbench.o bitpack.o bitunpack.o vsimple.o vp4dd.o vp4dc.o varintg8iu.o ext/simple8b.o $(SIMDCOMP)
+ gcc -O3 icbench.o bitpack.o bitunpack.o vsimple.o vp4dd.o vp4dc.o varintg8iu.o ext/simple8b.o $(SIMDCOMP) -lm -o icbench $(LFLAGS)
+
+idxcr: idxcr.o bitpack.o $(SIMDCOMP) vp4dc.o vsimple.o
+ gcc -O3 idxcr.o bitpack.o $(SIMDCOMP) vp4dc.o vsimple.o -o idxcr $(LFLAGS)
+
+idxqry: idxqry.o bitunpack.o $(SIMDCOMP) vp4dd.o
+ gcc -O3 idxqry.o bitunpack.o $(SIMDCOMP) vp4dd.o -o idxqry $(LFLAGS)
.c.o:
- cc -O3 $(CFLAGS) $< -c -o $@
+ gcc -O3 $(CFLAGS) $< -c -o $@
+clean:
+ rm *.o
+ rm ext/*.o
+ rm ext/simdcomp/*.o
+ rm ext/simdcomp/src/*.o
+cleanw:
+ del .\*.o
+ del ext\*.o
+ del ext\simdcomp\*.o
+ del ext\simdcomp\src\*.o
diff --git a/vint.h b/vint.h
index 5169b5c..6e7f4ce 100644
--- a/vint.h
+++ b/vint.h
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -27,44 +27,62 @@
#ifndef VINT_H
#define VINT_H
#include "conf.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
//-------------------------------------- variable byte : 32 bits ----------------------------------------------------------------
- //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
-static unsigned char vtab[]= { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 };
#define vbvlen(__x) vtab[(__x)&0xf]
#define vbputa(__op, __x, __act) {\
- if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
+ if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\
else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\
else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
}
-
+
#define vbgeta(__ip, __x, __act) do { __x = *__ip;\
- if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\
- else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
+ if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\
+ else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | *(__ip+2) << 13; __ip += 3; __act;}\
- else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
- else { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\
+ else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
+ else { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\
} while(0)
+//------------------------------------------------------------------------------------------------------------------------
+ //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+static unsigned char vtab[]= { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 };
+// vbvlen: length in bytes of a compressed value, given its first byte __x
-#define vblen(_x_) ({ unsigned __x = _x_; __x > 0x7f?(__x > 0x3fff?(__x > 0x1fffff?(__x > 0x0fffffff?5:4):3):2):1; })
+// vblen: length in bytes of the value __x once compressed with variable byte
+#define vblen(__x) ({ unsigned _x = __x; _x > 0x7f?(_x > 0x3fff?(_x > 0x1fffff?(_x > 0x0fffffff?5:4):3):2):1; })
+
+// compress single value
#define vbput(__op, __x) { unsigned _x__ = __x; vbputa(__op, _x__, ;); }
+// decompress single value
#define vbget(__ip) ({ unsigned _x_; vbgeta(__ip, _x_, ;); _x_; })
-static inline unsigned char *vbenc (unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) vbput(out, *in++); return out;}
-static inline unsigned char *vbdec (unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;}
+// compress an array of n unsigned 32-bit values (in[n]) into the buffer out. Return value = end of compressed buffer out
+static inline unsigned char *vbenc (unsigned *__restrict in, int n, unsigned char *__restrict out) { unsigned *in_ = in +n; while(in < in_) vbput(out, *in++); return out;}
+
+// decompress the buffer in into an array of n unsigned values. Return value = end of the compressed buffer in
+static inline unsigned char *vbdec (unsigned char *__restrict in, int n, unsigned *__restrict out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;}
//--------------------------------------- variable byte : 15 bits -------------------------------------------------------------------
-#define vblen16(__x) ((__x) > 0x7f?2:1)
#define vbput16(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)
#define vbgeta16(__ip,__x, __act) do { if((__x = *__ip++) > 0x7f) __x = (__x & 0x7f) << 8 | *__ip++; __act; } while(0)
+
+#define vblen16(__x) ((__x) > 0x7f?2:1)
#define vbget16(__ip) ({ unsigned _x; vbgeta16(__ip, _x, ;); _x; })
-static inline unsigned char *vbenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned short *in_ = in +n; while(in < in_) vbput16(out, *in++); return out;}
-static inline unsigned char *vbdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out) { unsigned short *out_ = out+n,x; while(out < out_) vgeta16(in, x, *out++ = x); return in; }
+// like vbenc but for 16-bit values
+static inline unsigned char *vbenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out) { unsigned short *in_ = in +n; while(in < in_) vbput16(out, *in++); return out;}
+// like vbdec but for 16-bit values
+static inline unsigned char *vbdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out) { unsigned short *out_ = out+n,x; while(out < out_) vbgeta16(in, x, *out++ = x); return in; }
+#ifdef __cplusplus
+}
#endif
-
-
+#endif
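A minimal round-trip sketch for the variable byte functions declared above, assuming only vbenc/vbdec and sizing the output buffer for the 5-byte worst case per 32-bit value:

    #include <stdio.h>
    #include "vint.h"

    int main() {
        unsigned in[4] = { 1, 127, 300, 1u<<30 }, out[4];
        unsigned char buf[4*5];                    // worst case: 5 bytes per value
        unsigned char *end = vbenc(in, 4, buf);    // returns end of compressed buffer
        vbdec(buf, 4, out);                        // decompress 4 values
        printf("%u %u %u %u in %d bytes\n", out[0], out[1], out[2], out[3], (int)(end - buf));
        return 0;
    }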
diff --git a/vp4dc.c b/vp4dc.c
index 17d323f..91047c0 100644
--- a/vp4dc.c
+++ b/vp4dc.c
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -24,18 +24,22 @@
vp4dd.c - "Integer Compression" Turbo PforDelta
**/
+#include <stdio.h>
#include "conf.h"
#include "bitpack.h"
+
#include "vp4dc.h"
#define PAD8(__x) ( (((__x)+8-1)/8) )
#include <string.h>
+//------------------------------------------
+#define P4DSIZE 128 //64 //
+#define P4DENC p4denc
#define USIZE 32
#include "vp4dc_.h"
+#undef USIZE
#define USIZE 16
#include "vp4dc_.h"
-
-
diff --git a/vp4dc.h b/vp4dc.h
index e23a94b..9c81cb9 100644
--- a/vp4dc.h
+++ b/vp4dc.h
@@ -16,12 +16,21 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- vp4dc.h - "Integer Compression" Turbo PforDelta
+ vp4dc.h - "Integer Compression" TurboPfor (see vp4dd.h for decompression)
**/
-unsigned char *p4denc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out);
-unsigned char *p4denc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out);
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// compress an integer array of n values into the buffer out. Return value = end of compressed buffer out
+unsigned char *p4denc32(unsigned *__restrict in, int n, unsigned char *__restrict out);
+unsigned char *p4denc16(unsigned short *__restrict in, int n, unsigned char *__restrict out);
+
+#ifdef __cplusplus
+}
+#endif
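Compression and decompression stay symmetric across the library, so a TurboPFor round trip looks like the variable byte one; a sketch assuming one 128-integer block (the P4DSIZE above) with a single large outlier to trigger the exception path:

    #include <stdio.h>
    #include "vp4dc.h"
    #include "vp4dd.h"

    int main() {
        unsigned in[128], out[128];
        unsigned char buf[128*4 + 64], *end;       // packed data + header/exception slack
        int i;
        for(i = 0; i < 128; i++) in[i] = i & 7;
        in[60] = 100000;                           // one exception: forces the patched path
        end = p4denc32(in, 128, buf);              // compress one block
        p4ddec32(buf, 128, out);                   // bulk decompress
        printf("ok=%d size=%d bytes\n", out[60] == in[60], (int)(end - buf));
        return 0;
    }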
diff --git a/vp4dc_.h b/vp4dc_.h
index 75fd9f3..86ac68c 100644
--- a/vp4dc_.h
+++ b/vp4dc_.h
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -24,15 +24,23 @@
vp4dc_.c - "Integer Compression" Turbo PforDelta
**/
#define uint_t TEMPLATE3(uint, USIZE, _t)
+#define P4DN (P4DSIZE/64)
-unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ out) {
- int i; unsigned cnt[USIZE+1] = {0}; uint_t b = 0;
- for(i = 0; i < n; i++) b |= in[i], ++cnt[TEMPLATE2(bsr, USIZE)(in[i])];
+unsigned char *TEMPLATE2(P4DENC, USIZE)(uint_t *__restrict in, int n, unsigned char *__restrict out) { unsigned char *op = out;
+ int i,b=0; unsigned cnt[USIZE+1] = {0}; uint_t *ip;
+
+ for(ip = in; ip < in+(n&~3); ) {
+ ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++;
+ ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++;
+ ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++;
+ ++cnt[TEMPLATE2(bsr, USIZE)(*ip)]; b |= *ip++;
+ }
+ while(ip < in+n) b |= *ip, ++cnt[TEMPLATE2(bsr, USIZE)(*ip++)];
b = TEMPLATE2(bsr, USIZE)(b);
- unsigned xb=b, ml = PAD8(n*b)+1,x = cnt[b];
+ unsigned xb = b, ml = PAD8(n*b)+1,x = cnt[b];
for(i = b-1; i >= 0; i--) {
- unsigned l = PAD8(n*i) + (x?(2+16+PAD8(x*(xb-i))):1);
+ unsigned l = PAD8(n*i) + 2+P4DN*8+PAD8(x*(xb-i));
if(l < ml) b = i, ml = l;
x += cnt[i]; /*if(x >= 64) break;*/
}
@@ -40,9 +48,10 @@ unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned
*out++ = b << 1;
return TEMPLATE2(bitpack, USIZE)(in, n, b, out);
}
- xb-=b;
- uint_t _in[0x100], inx[0x100]; unsigned miss[0x100];
-  unsigned long long xmap[2]; xmap[0] = xmap[1] = 0; unsigned xn, msk = (1ull<<b)-1;
+  xb -= b;
+  uint_t _in[0x100], inx[0x100]; unsigned miss[0x100];
+  unsigned long long xmap[P4DN] = {0}; unsigned xn, msk = (1ull<<b)-1;
   for(xn = i = 0; i < n; i++) { unsigned c = in[i]; _in[i] = c & msk; if(c > msk) { miss[xn] = i; inx[xn++] = c >> b; } }
   for(i = 0; i < xn; i++) { unsigned c = miss[i];
- xmap[c>>6] |= (1ull<<(c&0x3f));
+ xmap[c>>6] |= (1ull << (c&0x3f));
}
- *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2; out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out);
- *(unsigned long long *)out = xmap[0]; out += 8;
- *(unsigned long long *)out = xmap[1]; out += 8;
- memset(&inx[xn],0,128);
- return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out);
+ *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2; out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out);
+ for(i=0;i < P4DN; i++) { *(unsigned long long *)out = xmap[i]; out += 8; } //memset(&inx[xn],0,P4DSIZE);
+ return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out); //if(op-out >= PAD8(n*b)+1) { printf("Fatal error b=%d,xb=%d\n", b, xb); exit(0); } return out;
}
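The bit width b above is chosen by direct cost minimization: for each candidate width i, the encoder compares the bytes needed to bit pack all n values at i bits plus the exception area (a 2-byte header, P4DN 64-bit exception bitmaps, and the x outliers stored with xb-i extra bits each). A standalone sketch of that cost function with illustrative numbers (n, xb and x are made up, and x is held fixed purely for illustration):

    #include <stdio.h>
    #define PAD8(x) (((x)+7)/8)
    #define P4DN 2                                  // 128-integer block: two 64-bit bitmaps

    int main() {                                    // 128 values, 8 outliers needing 17 bits
        int n = 128, xb = 17, x = 8, i;
        for(i = 4; i <= 17; i += 13) {
            unsigned cost = PAD8(n*i) + 2 + P4DN*8 + PAD8(x*(xb - i));
            printf("b=%2d -> %u bytes\n", i, cost); // b= 4: 64+18+13 = 95, b=17: 272+18 = 290
        }
        return 0;
    }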
diff --git a/vp4dd.c b/vp4dd.c
index 2d9e452..855013f 100644
--- a/vp4dd.c
+++ b/vp4dd.c
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -24,17 +24,20 @@
vp4dd.c - "Integer Compression" Turbo PforDelta
**/
+#include <stdio.h>
#include "conf.h"
#include "bitunpack.h"
#include "vp4dd.h"
-
+
#define PAD8(__x) ( (((__x)+8-1)/8) )
-#include <stdio.h>
+
#define USIZE 32
#include "vp4dd_.h"
+#undef USIZE
-//#define USIZE 16
-//#include "vp4dd_.h"
+#define USIZE 16
+#include "vp4dd_.h"
+#undef USIZE
diff --git a/vp4dd.h b/vp4dd.h
index 71af111..fcb740c 100644
--- a/vp4dd.h
+++ b/vp4dd.h
@@ -16,33 +16,43 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
vp4dd.h - "Integer Compression" Turbo PforDelta
**/
-unsigned char *p4ddec32( unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);
-unsigned char *p4ddecx32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);
-//-----------------------------------------------------------------------
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define P4DSIZE 128 //64 //
+#define P4DN (P4DSIZE/64)
+
+//---------------- Bulk decompress of TurboPFor compressed integer array -------------------------------------------------------
+// decompress an array previously packed with p4denc32 (32-bit values). Return value = end of compressed buffer in
+unsigned char *p4ddec32( unsigned char *__restrict in, int n, unsigned *__restrict out);
+
+//---------------- Direct Access functions to compressed TurboPFor array -------------------------------------------------------
#define P4D_PAD8(__x) ( (((__x)+8-1)/8) )
#define P4D_XB(__x) ((__x & 1)?(__x >> 8):0)
#define P4D_B(__x) ((__x >> 1) & 0x3f)
-#define P4D_ININC(__in, __x) __in += 1+(__x & 1)
+#define P4D_ININC(__in, __x) __in += 1+(__x & 1)
-static inline unsigned vp4dbits(unsigned char *__restrict__ in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); }
+static inline unsigned vp4dbits(unsigned char *__restrict in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); }
struct p4d {
unsigned long long *xmap;
unsigned char *ex;
- unsigned i,xb,cum[2];
+ unsigned i,xb,cum[P4DN+1];
int oval,idx;
};
-static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int n, unsigned *b) { unsigned char *in = *pin;
- static unsigned long long xmap[2] = { 0 };
+// initialize direct access to a compressed block
+static inline void p4dini(struct p4d *p4d, unsigned char *__restrict *pin, int n, unsigned *b) { unsigned char *in = *pin;
+ static unsigned long long xmap[P4DN+1] = { 0 };
unsigned i = *(unsigned short *)in;
p4d->i = i;
@@ -52,22 +62,33 @@ static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int
*pin = in;
p4d->ex = in + P4D_PAD8(n*(*b));
- p4d->xmap = (i&1)?p4d->ex:xmap;
- p4d->ex += (i&1)?16:0;
+ p4d->xmap = (i&1)?(unsigned long long *)p4d->ex:xmap;
+ p4d->ex += (i&1)?8*P4DN:0;
p4d->cum[0] = 0;
- p4d->cum[1] = popcnt64(p4d->xmap[0]);
+ for(i=1; i < P4DN; i++) p4d->cum[i] = p4d->cum[i-1] + popcnt64(p4d->xmap[i-1]);
p4d->oval = p4d->idx = -1;
}
-static ALWAYS_INLINE unsigned vp4dget32(struct p4d p4d, unsigned char *__restrict__ in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx32(in, b, idx*b);
- if(unlikely(p4d.xmap[bi = idx>>6] & (1ull<<(cl = idx & 0x3f)))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<<cl)))*p4d.xb) << b;
- return u;
-}
+// Get the value at index idx directly from the compressed block (no bulk decompression)
+static ALWAYS_INLINE unsigned vp4dget32(struct p4d p4d, unsigned char *__restrict in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx32(in, b, idx*b);
+ if(unlikely(p4d.xmap[bi = idx>>6] & (1ull<<(cl = (idx & 0x3f))))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<<cl)))*p4d.xb) << b;
+ return u;
+}
-static ALWAYS_INLINE int vp4dgeq(struct p4d *p4d, unsigned char *__restrict__ in, unsigned b, int val) { do p4d->oval += vp4dget(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; }
+// Get the next single value greater than or equal to val
+static ALWAYS_INLINE int vp4dgeq(struct p4d *p4d, unsigned char *__restrict in, unsigned b, int val) { do p4d->oval += vp4dget(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; }
+/* like p4ddec32 but using direct access. This is only a demo showing direct access usage. Use p4ddec32 instead for decompressing entire blocks */
+unsigned char *p4ddecx32(unsigned char *__restrict in, int n, unsigned *__restrict out);
+unsigned char *p4dfdecx32(unsigned char *__restrict in, int n, unsigned start, unsigned *__restrict out);
+unsigned char *p4df0decx32(unsigned char *__restrict in, int n, unsigned start, unsigned *__restrict out);
+
+#ifdef __cplusplus
+}
+#endif
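vp4dget32 never touches the rest of the block: the value's low b bits are extracted in place, and if the exception bitmap marks the index, the precomputed cum[] prefix popcounts give its rank inside the exception area in O(1). A minimal usage sketch, assuming the declarations above and a buffer previously produced by p4denc32 on a 128-integer block:

    #include "vp4dd.h"

    // Read a single value from a compressed block with zero bulk decompression.
    unsigned get_one(unsigned char *compressed, unsigned idx) {
        struct p4d p4d;
        unsigned b;
        unsigned char *in = compressed;
        p4dini(&p4d, &in, 128, &b);          // parse header, locate exception bitmap/area
        return vp4dget32(p4d, in, b, idx);   // bit extract + popcount rank
    }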
diff --git a/vp4dd_.h b/vp4dd_.h
index f92ce5f..720e533 100644
--- a/vp4dd_.h
+++ b/vp4dd_.h
@@ -16,14 +16,15 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
vp4dd_.h - "Integer Compression" Turbo PforDelta
**/
- #ifdef __AVX2__
+
+ #ifdef USE__AVX2__ // disabled by default.
#include <immintrin.h>
static ALIGNED(unsigned char, shuffles[256][8], 32) = {
@@ -291,23 +292,43 @@ static ALIGNED(unsigned char, shuffles[256][8], 32) = {
#define uint_t TEMPLATE3(uint, USIZE, _t)
-unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, uint_t *__restrict__ out) {
- uint_t ex[0x100+8]; unsigned i = *(unsigned short *)in; uint_t b = P4D_B(i); unsigned xb = P4D_XB(i);
+unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict in, int n, uint_t *__restrict out) {
+ uint_t ex[0x100+8];
+ unsigned i = *(unsigned short *)in;
+ uint_t b = P4D_B(i);
+ unsigned xb = P4D_XB(i);
P4D_ININC(in,i);
- in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out);
- if(i & 1) {
- unsigned long long b0 = *(unsigned long long *)in; in += 8; unsigned long long b1 = *(unsigned long long *)in; in += 8;
- in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(b0) + popcnt64(b1), xb, ex);
- #ifdef __AVX2__
- unsigned *op,*pex = ex;
- for(op = out; b0; b0 >>= 8,op += 8) { const unsigned m = (unsigned char)b0, mc=popcnt32(m), s = pex[mc]; pex[mc]=0;
- _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
- }
- for(op = out+64; b1; b1 >>= 8,op += 8) { const unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0;
- _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
- }
- #elif defined(__SSE4_1__)
+ in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out);
+ if(i & 1) {
+ #if P4DN == 2
+ unsigned long long bb[P4DN]; unsigned num=0;
+ bb[0] = *(unsigned long long *)in; in += 8;
+ bb[1] = *(unsigned long long *)in; in += 8;
+ in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(bb[0]) + popcnt64(bb[1]), xb, ex);
+ #else
+ unsigned long long bb[P4DN]; unsigned num=0;
+ for(i = 0; i < P4DN; i++) { bb[i] = *(unsigned long long *)in; in += 8; num += popcnt64(bb[i]); }
+ in = TEMPLATE2(bitunpack, USIZE)(in, num, xb, ex);
+ #endif
+
+ #if 0 //def __AVX2__
+ uint_t *op,*pex = ex;
+ #if 0 //P4DN == 2
+ for(op = out; b0; b0 >>= 8,op += 8) { unsigned m = (unsigned char)b0, mc=popcnt32(m), s = pex[mc]; pex[mc]=0;
+ _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
+ }
+ for(op = out+64; b1; b1 >>= 8,op += 8) { unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0;
+ _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
+ }
+ #else
+ for(i = 0; i < P4DN; i++) {
+ for(op = out; bb[i]; bb[i] >>= 8,op += 8) { unsigned m = (unsigned char)bb[i], mc=popcnt32(m), s = pex[mc]; pex[mc]=0;
+ _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
+ } out += 64;
+ }
+ #endif
+ #elif defined(__SSE4_1__)
static ALIGNED(char, shuffles[16][16], 16) = {
#define _ 0x80
{ _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ },
@@ -328,23 +349,38 @@ unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, u
{ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
#undef _
};
- unsigned *op,*pex = ex;
- for(op = out; b0; b0 >>= 4,op+=4) { const unsigned m = b0&0xf;
+ uint_t *op,*pex = ex;
+
+ #if P4DN == 2
+ for(op = out; bb[0]; bb[0] >>= 4,op+=4) { const unsigned m = bb[0]&0xf;
_mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m);
}
- for(op=out+64; b1; b1 >>= 4,op+=4) { const unsigned m = b1&0xf;
+ for(op=out+64; bb[1]; bb[1] >>= 4,op+=4) { const unsigned m = bb[1]&0xf;
_mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m);
}
+ #else
+ for(i = 0; i < P4DN; i++) { // generic loop; the P4DN == 2 variant above is unrolled
+ for(op = out; bb[i]; bb[i] >>= 4,op+=4) { const unsigned m = bb[i]&0xf;
+ _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m);
+ } out+=64;
+ }
+ #endif
#else
unsigned k = 0;
- while(b0) { unsigned x = ctzll(b0); out[x] += ex[k++]<<b; b0 &= b0-1; }
- while(b1) { unsigned x = ctzll(b1); out[64+x] += ex[k++]<<b; b1 &= b1-1; }
+ for(i = 0; i < P4DN; i++, out += 64)
+ while(bb[i]) { unsigned x = ctzll(bb[i]); out[x] += ex[k++]<<b; bb[i] &= bb[i]-1; }
 #endif
 }
 return in;
 }
+unsigned char *TEMPLATE2(p4dfdecx, USIZE)(unsigned char *__restrict in, int n, unsigned start, uint_t *__restrict out) {
+ unsigned b,i;
+ struct p4d p4d;
+ p4dini(&p4d, &in, n, &b);
+
+ if(unlikely(p4d.i & 1)) {
+ for(i = 0; i < n; i++)
+ out[i] = TEMPLATE2(vp4dget, USIZE)(p4d, in, b, i)+start+i+1;
+ return p4d.ex + PAD8((p4d.cum[P4DN-1] + popcnt64(p4d.xmap[P4DN-1]))*p4d.xb);
+ } else {
+ for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, b, i*b)+start+i+1;
+ return p4d.ex;
+ }
}
- #endif
+
+unsigned char *TEMPLATE2(p4df0decx, USIZE)(unsigned char *__restrict in, int n, unsigned start, uint_t *__restrict out) {
+ unsigned b,i;
+ struct p4d p4d;
+ p4dini(&p4d, &in, n, &b);
+
+ if(unlikely(p4d.i & 1)) {
+ for(i = 0; i < n; i++)
+ out[i] = TEMPLATE2(vp4dget, USIZE)(p4d, in, b, i)+start;
+ return p4d.ex + PAD8((p4d.cum[P4DN-1] + popcnt64(p4d.xmap[P4DN-1]))*p4d.xb);
+ } else {
+ for(i = 0; i < n; i++) out[i] = TEMPLATE2(_bitgetx, USIZE)(in, b, i*b)+start;
+ return p4d.ex;
+ }
+}
+
+
diff --git a/vsimple.c b/vsimple.c
index f8bff77..d0709dc 100644
--- a/vsimple.c
+++ b/vsimple.c
@@ -16,7 +16,7 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
@@ -25,18 +25,21 @@
**/
#include "vsimple.h"
-
+
#define USE_RLE
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
#define SV_LIM unsigned char s_lim[] = { 0, 28, 28, 28, 28, 36, 36, 36, 36, 36, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 0 };
-#define SV_ITM unsigned s_itm[] = { -1, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 }
+#define SV_ITM unsigned s_itm[] = { 0, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 }
static SV_ITM;
static SV_LIM;
#include <string.h>
#define USIZE 32
#include "vsimple_.h"
+#undef USIZE
#define USIZE 16
#include "vsimple_.h"
+#undef USIZE
+
diff --git a/vsimple.h b/vsimple.h
index b1684f4..4eeb26f 100644
--- a/vsimple.h
+++ b/vsimple.h
@@ -16,27 +16,33 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
- vsimple.h - "Integer Compression" variable simple
+ vsimple.h - "Integer Compression" variable simple "SimpleV"
+ SimpleV belongs to the "simple family" of integer compression schemes, like simple-9, simple-16
+ or simple-8b. SimpleV compresses integers in groups into variable word sizes of 32, 40 and 64 bits, plus RLE (run length encoding).
+ SimpleV is faster than simple-16 and compresses better than simple-16 or simple-8b.
**/
#ifdef __cplusplus
extern "C" {
#endif
-unsigned char *vsenc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out);
-unsigned char *vsdec32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);
+// compress an array of n unsigned 32-bit values (in[n]) into the buffer out. Return value = end of compressed buffer out
+unsigned char *vsenc32(unsigned *__restrict in, int n, unsigned char *__restrict out);
-unsigned char *vsenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out);
-unsigned char *vsdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out);
+// decompress the buffer in into an array of n unsigned values. Return value = end of the compressed buffer in
+unsigned char *vsdec32(unsigned char *__restrict in, int n, unsigned *__restrict out);
+
+// like vsenc32 but for 16-bit values
+unsigned char *vsenc16(unsigned short *__restrict in, int n, unsigned char *__restrict out);
+
+// like vsdec32 but for 16-bit values
+unsigned char *vsdec16(unsigned char *__restrict in, int n, unsigned short *__restrict out);
#ifdef __cplusplus
}
#endif
-
-
-
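A SimpleV round trip works like the other codecs; a sketch assuming the vsenc32/vsdec32 declarations above, with a constant run so the RLE path is exercised:

    #include <stdio.h>
    #include "vsimple.h"

    int main() {
        unsigned in[1000], out[1000];
        unsigned char buf[1000*5];                 // generous worst-case output
        int i;
        for(i = 0; i < 1000; i++) in[i] = 7;       // long run: candidate for RLE
        unsigned char *end = vsenc32(in, 1000, buf);
        vsdec32(buf, 1000, out);
        printf("ok=%d compressed to %d bytes\n", out[999] == 7, (int)(end - buf));
        return 0;
    }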
diff --git a/vsimple_.h b/vsimple_.h
index 59f1dbe..891efda 100644
--- a/vsimple_.h
+++ b/vsimple_.h
@@ -16,29 +16,38 @@
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- - email : powturbo@gmail.com
+ - email : powturbo [AT] gmail.com
- github : https://github.com/powturbo
- homepage : https://sites.google.com/site/powturbo/
- twitter : https://twitter.com/powturbo
vsimple_.h - "Integer Compression" variable simple
**/
-
+#include <stdio.h>
+#include <string.h>
+#include "conf.h"
#include "vint.h"
#define uint_t TEMPLATE3(uint, USIZE, _t)
-unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ op) {
- unsigned xm,m,r;
+unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict in, int n, unsigned char *__restrict op) {
+ unsigned xm,m,r,x;
uint_t *e = in+n,*ip;
for(ip = in; ip < e; ) {
#ifdef USE_RLE
- if(ip < e-4 && *ip == *(ip+1)) { uint_t *q = ip+1; while(q < e-1 && *(q+1) == *ip) q++; r = q - ip;
- if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) { m = (*ip)?33:0; goto a; }
+ if(ip+4 < e && *ip == *(ip+1)) {
+ uint_t *q = ip+1;
+ while(q+1 < e && *(q+1) == *ip) q++;
+ r = q - ip;
+ if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) {
+ m = (*ip)?33:0;
+ goto a;
+ }
} else
#endif
- r = 0; unsigned x = m = bsr32(*ip);
- while((r+1)*(xm = x > m?x:m) <= s_lim[xm]) { m = xm; x = TEMPLATE2(bsr, USIZE)(*(ip+(++r))); }
- if(/*xm != 32 &&*/ m) while(r < s_itm[m]) m++;
+ r = 0;
+ for(m = x = TEMPLATE2(bsr, USIZE)(*ip); (r+1)*(xm = x > m?x:m) <= s_lim[xm] && ip+r < e; ) { m = xm; x = TEMPLATE2(bsr, USIZE)(*(ip+(++r))); }
+ if(m) while(r < s_itm[m]) m++;
@@ ... @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict ip, int n, uint_t *__restrict out) {
- unsigned r = (w>>4)&0xf; ip++;
+ unsigned r = (w>>4)&0xf; ip++;
if(unlikely(r == 0xf)) {
if(n <= 0x100)
r = (w>>8)&0xff, ip++;
@@ -247,7 +258,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
} break;
case 1:
OP( 0) = (w >> 4) & 1;
- OP( 1) = (w >> 5) & 1;
+ OP( 1) = (w >> 5) & 1;
OP( 2) = (w >> 6) & 1;
OP( 3) = (w >> 7) & 1;
OP( 4) = (w >> 8) & 1;
@@ -273,7 +284,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP(24) = (w >> 28) & 1;
OP(25) = (w >> 29) & 1;
OP(26) = (w >> 30) & 1;
- OP(27) = (w >> 31) & 1; OPI( 28); ip+=4;
+ OP(27) = (w >> 31) & 1; OPI( 28); ip+=4;
break;
case 2:
OP( 0) = (w >> 4) & 3;
@@ -289,7 +300,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP(10) = (w >> 24) & 3;
OP(11) = (w >> 26) & 3;
OP(12) = (w >> 28) & 3;
- OP(13) = (w >> 30) & 3; OPI( 14); ip+=4;
+ OP(13) = (w >> 30) & 3; OPI( 14); ip+=4;
break;
case 3:
OP( 0) = (w >> 4) & 7;
@@ -300,7 +311,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP( 5) = (w >> 19) & 7;
OP( 6) = (w >> 22) & 7;
OP( 7) = (w >> 25) & 7;
- OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4;
+ OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4;
break;
case 4:
OP( 0) = (w >> 4) & 0xf;
@@ -326,7 +337,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP(2) = (w >> 16) & 0x3f;
OP(3) = (w >> 22) & 0x3f;
OP(4) = (w >> 28) & 0x3f;
- OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5;
+ OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5;
break;
case 7:
@@ -353,7 +364,7 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP(0) = (w >> 4) & 0x1ff;
OP(1) = (w >> 13) & 0x1ff;
OP(2) = (w >> 22) & 0x1ff;
- OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5;
+ OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5;
break;
case 10:
@@ -362,32 +373,32 @@ unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, ui
OP(2) = (w >> 24) & 0x3ff;
OP(3) = (w >> 34) & 0x3ff;
OP(4) = (w >> 44) & 0x3ff;
- OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8;
+ OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8;
break;
case 12:
- OP(0) = (w >> 4) & 0xfff;
- OP(1) = (w >> 16) & 0xfff;
- OP(2) = (w >> 28) & 0xfff;
- OP(3) = (w >> 40) & 0xfff;
- OP(4) = (w >> 52) & 0xfff; OPI( 5); ip+=8;
+ OP(0) = (w >> 4) & 0xfffu;
+ OP(1) = (w >> 16) & 0xfffu;
+ OP(2) = (w >> 28) & 0xfffu;
+ OP(3) = (w >> 40) & 0xfffu;
+ OP(4) = (w >> 52) & 0xfffu; OPI( 5); ip+=8;
break;
case 15:
- OP(0) = (w >> 4) & 0x7fff;
- OP(1) = (w >> 19) & 0x7fff;
- OP(2) = (w >> 34) & 0x7fff;
- OP(3) = (w >> 49) & 0x7fff; OPI( 4); ip+=8;
+ OP(0) = (w >> 4) & 0x7fffu;
+ OP(1) = (w >> 19) & 0x7fffu;
+ OP(2) = (w >> 34) & 0x7fffu;
+ OP(3) = (w >> 49) & 0x7fffu; OPI( 4); ip+=8;
break;
case 11:
- OP(0) = (w >> 4) & 0xfffff; // 20
- OP(1) = (w >> 24) & 0xfffff;
- OP(2) = (w >> 44) & 0xfffff; OPI( 3); ip+=8;
+ OP(0) = (w >> 4) & 0xfffffu; // 20
+ OP(1) = (w >> 24) & 0xfffffu;
+ OP(2) = (w >> 44) & 0xfffffu; OPI( 3); ip+=8;
break;
case 13:
- OP(0) = (w >> 4) & ((1<<30)-1);
- OP(1) = (w >> 34) & ((1<<30)-1); OPI( 2); ip+=8;
- break;
+ OP(0) = (w >> 4) & 0x3fffffffu;
+ OP(1) = (w >> 34) & 0x3fffffffu; OPI( 2); ip+=8;
+ break;
case 14:
- OP(0) = (w >> 4) & ((1ull<<32)-1); OPI( 1); ip+=5;
+ OP(0) = (w >> 4) & 0xffffffffu; OPI( 1); ip+=5;
break;
}
}