From b1bd8dcf23ee10f052af44a8aea466914497a61d Mon Sep 17 00:00:00 2001
From: powturbo
Date: Tue, 28 Oct 2014 22:19:48 +0100
Subject: [PATCH] Initial commit

---
 README.md                                   |    69 +
 aux/OPT_PFD/main.cpp                        |   101 +
 aux/OPT_PFD/opt_p4.h                        |    54 +
 aux/OPT_PFD/pf.h                            |   158 +
 aux/OPT_PFD/s16head.h                       |   251 +
 aux/OPT_PFD/unpack.h                        |   773 +
 aux/simdcomp/bitpacka.c                     | 17773 +++++++++++
 aux/simdcomp/bitpacka.h                     |    28 +
 aux/simdcomp/bitpacka.o                     |   Bin 0 -> 116792 bytes
 aux/simdcomp/example.c                      |    66 +
 aux/simdcomp/include/simdbitpacking.h       |    21 +
 aux/simdcomp/include/simdcomp.h             |    12 +
 aux/simdcomp/include/simdcomputil.h         |    29 +
 .../include/simdintegratedbitpacking.h      |    27 +
 aux/simdcomp/makefile                       |    54 +
 aux/simdcomp/src/simdbitpacking.c           | 14008 ++++++++
 aux/simdcomp/src/simdbitpacking.o           |   Bin 0 -> 74408 bytes
 aux/simdcomp/src/simdcomputil.c             |    56 +
 aux/simdcomp/src/simdcomputil.o             |   Bin 0 -> 2416 bytes
 aux/simdcomp/src/simdintegratedbitpacking.c | 24863 ++++++++++++++++
 aux/simdcomp/src/simdintegratedbitpacking.o |   Bin 0 -> 152584 bytes
 aux/simdcomp/src/unit.c                     |    63 +
 aux/simple8b.c                              |   333 +
 aux/simple8b.h                              |     2 +
 aux/simple8b.o                              |   Bin 0 -> 8840 bytes
 aux/vabyte.h                                |    99 +
 aux/varintg8iu.c                            |   181 +
 aux/varintg8iu.h                            |     5 +
 aux/vas16c.h                                |    35 +
 aux/vas16d.h                                |   402 +
 aux/vbyte_poly.h                            |    46 +
 bitpack.c                                   |    34 +
 bitpack.h                                   |    30 +
 bitpack64_.h                                |  1136 +
 bitpack_.h                                  |   200 +
 bitunpack.c                                 |    56 +
 bitunpack.h                                 |    51 +
 bitunpack64_.h                              |  1365 +
 bitunpack_.h                                |   112 +
 conf.h                                      |    70 +
 icbench.c                                   |   617 +
 makefile                                    |    28 +
 vint.h                                      |    70 +
 vp4dc.c                                     |    41 +
 vp4dc.h                                     |    27 +
 vp4dc_.h                                    |    62 +
 vp4dd.c                                     |    40 +
 vp4dd.h                                     |    73 +
 vp4dd_.h                                    |   369 +
 vsimple.c                                   |    42 +
 vsimple.h                                   |    42 +
 vsimple_.h                                  |   396 +
 52 files changed, 64370 insertions(+)
 create mode 100644 README.md
 create mode 100644 aux/OPT_PFD/main.cpp
 create mode 100644 aux/OPT_PFD/opt_p4.h
 create mode 100644 aux/OPT_PFD/pf.h
 create mode 100644 aux/OPT_PFD/s16head.h
 create mode 100644 aux/OPT_PFD/unpack.h
 create mode 100644 aux/simdcomp/bitpacka.c
 create mode 100644 aux/simdcomp/bitpacka.h
 create mode 100644 aux/simdcomp/bitpacka.o
 create mode 100644 aux/simdcomp/example.c
 create mode 100644 aux/simdcomp/include/simdbitpacking.h
 create mode 100644 aux/simdcomp/include/simdcomp.h
 create mode 100644 aux/simdcomp/include/simdcomputil.h
 create mode 100644 aux/simdcomp/include/simdintegratedbitpacking.h
 create mode 100644 aux/simdcomp/makefile
 create mode 100644 aux/simdcomp/src/simdbitpacking.c
 create mode 100644 aux/simdcomp/src/simdbitpacking.o
 create mode 100644 aux/simdcomp/src/simdcomputil.c
 create mode 100644 aux/simdcomp/src/simdcomputil.o
 create mode 100644 aux/simdcomp/src/simdintegratedbitpacking.c
 create mode 100644 aux/simdcomp/src/simdintegratedbitpacking.o
 create mode 100644 aux/simdcomp/src/unit.c
 create mode 100644 aux/simple8b.c
 create mode 100644 aux/simple8b.h
 create mode 100644 aux/simple8b.o
 create mode 100644 aux/vabyte.h
 create mode 100644 aux/varintg8iu.c
 create mode 100644 aux/varintg8iu.h
 create mode 100644 aux/vas16c.h
 create mode 100644 aux/vas16d.h
 create mode 100644 aux/vbyte_poly.h
 create mode 100644 bitpack.c
 create mode 100644 bitpack.h
 create mode 100644 bitpack64_.h
 create mode 100644 bitpack_.h
 create mode 100644 bitunpack.c
 create mode 100644 bitunpack.h
 create mode 100644 bitunpack64_.h
 create mode 100644 bitunpack_.h
 create mode 100644 conf.h
 create mode 100644 icbench.c
 create mode 100644 makefile
 create mode 100644 vint.h
 create mode 100644 vp4dc.c
 create mode 100644 vp4dc.h
 create mode 100644 vp4dc_.h
 create mode 100644 vp4dd.c
 create mode 100644 vp4dd.h
 create mode 100644 vp4dd_.h
 create mode 100644 vsimple.c
 create mode 100644 vsimple.h
 create mode 100644 vsimple_.h

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c92045a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+TurboPFor: Fastest Integer Compression
+======================================
+
+- 100% C, without inline assembly

+- Fastest **"Variable Byte"** implementation +

+- Novel **"Variable Simple"** faster than simple16 and more compact than simple64 +

+- Scalar **"Binary Packing"** with bulk decoding as fast as SIMD FastPFor in realistic (No "pure cache") scenarios +- Binary Packing with **Direct/Random Access** without decompressing entire blocks +- Access any single binary packed entry with **zero decompression** +

+- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding +

+- Several times faster than other libraries
+- Usage as easy as memcpy
+- Instant access to compressed *frequency* and *position* data in an inverted index, with zero decoding
+
+# Benchmark:
+i7-2600k at 3.4GHz, gcc 4.9, ubuntu 14.10.
+- Single thread
+- Realistic and practical benchmark with large integer arrays
+- No "pure cache" benchmark
+
+#### Synthetic data:
+coming soon!
+
+#### Data files:
+ - clueweb09.sorted from FastPFor (http://lemire.me/data/integercompression2014.html)
+      ./icbench -n10000000000 clueweb09.sorted
+
+| Size       | Ratio in % | Bits/Integer | C Time MB/s | D Time MB/s | Function         |
+|-----------:|-----------:|-------------:|------------:|------------:|:-----------------|
+|  514438405 |       8.16 |         2.61 |      357.22 |     1286.42 | TurboPFor        |
+|  514438405 |       8.16 |         2.61 |      358.09 |      309.70 | TurboPFor DA     |
+|  539841792 |       8.56 |         2.74 |        6.47 |      767.35 | OptP4            |
+|  583184112 |       9.25 |         2.96 |      132.42 |      914.89 | Simple16         |
+|  623548565 |       9.89 |         3.17 |      235.32 |      925.71 | SimpleV          |
+|  733365952 |      11.64 |         3.72 |      162.21 |     1312.15 | Simple64         |
+|  862464289 |      13.68 |         4.38 |     1274.01 |     1980.55 | TurboPack        |
+|  862464289 |      13.68 |         4.38 |     1285.28 |      868.06 | TurboPack DA     |
+|  862465391 |      13.68 |         4.38 |     1402.12 |     2075.15 | SIMD-BitPack FPF |
+| 6303089028 |     100.00 |        32.00 |     1257.50 |     1308.22 | copy             |
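+
+The direct-access ("DA") entries above read single values without decoding a
+block: at a fixed bit width b, value i starts at bit i*b, so one wide load and
+a mask recover it. A minimal sketch of the idea, not the library's actual API
+(`bitget` and the one word of padding after `w` are assumptions):
+
+    /* extract the i-th b-bit value from packed words w, 0 < b < 32;
+       assumes one extra readable word after the packed data */
+    static unsigned bitget(const unsigned *w, unsigned b, unsigned i) {
+        unsigned long long bp = (unsigned long long)i * b;   /* start bit   */
+        unsigned wp = (unsigned)(bp >> 5), sh = bp & 31;     /* word, shift */
+        unsigned long long v = w[wp] | ((unsigned long long)w[wp+1] << 32);
+        return (unsigned)(v >> sh) & ((1u << b) - 1);        /* keep b bits */
+    }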
+
+## Compile:
+  	make
+
+## Usage
+###### Synthetic data:
+ 1. Test all functions
+    ./icbench -a1.0 -m0 -x8 -n100000000
+
+  - zipfian distribution, alpha = 1.0 (ex. -a1.0 = uniform, -a1.5 = skewed distribution)
+  - number of integers = 100000000
+  - integer range from 0 to 255 (integer size = 0 to 8 bits)
+
+ 2. Individual function test (ex. copy, TurboPack, TurboPack direct access)
+    ./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopack,da -n100000000
+
+###### Data files:
+ - data file benchmark (file format as in FastPFor)
+    ./icbench -n10000000000 clueweb09.sorted
+
+## Reference:
+ - "SIMD-BitPack FPF" from FastPFor: https://github.com/lemire/simdcomp
+ - OptP4 and Simple-16 from http://jinruhe.com/
+
diff --git a/aux/OPT_PFD/main.cpp b/aux/OPT_PFD/main.cpp
new file mode 100644
index 0000000..2c0ec06
--- /dev/null
+++ b/aux/OPT_PFD/main.cpp
@@ -0,0 +1,101 @@
+/*
+ * test for OPT-pfd
+ *
+ * Author: sding
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "opt_p4.h"
+
+using namespace std;
+
+char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite"; // for reading list
+
+int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc)
+{
+    char fpath[128];
+    sprintf(fpath, "%s/%s", PATH, term);
+    FILE *fdd = fopen(fpath, "r");
+    if (fdd == NULL) return 0;
+
+    int nread, npos;
+
+    nread = fread(&npos, sizeof(unsigned), 1, fdd);
+    npos = 0;
+
+    while (nread > 0)
+    {
+        nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd);
+        if (nread <= 0) break;
+        fread(&freq[npos], sizeof(unsigned), 1, fdd);
+        npos++;
+    }
+    fclose(fdd);
+
+    int i;
+
+    /* fill out the max values (largest doc_id of each block) */
+    for (i = 0; i < npos; i += BS)
+        maxc[(i/BS)] = doc_id[i+BS-1];
+
+    /* take the gap for doc_id */
+    for (i = npos-1; i > 0; i--)
+    {
+        doc_id[i] -= doc_id[i-1];
+        doc_id[i]--;
+    }
+
+    for (i = 0; i < npos; i++)
+        freq[i]--;
+    return npos;
+}
+
+int main() // just for testing
+{
+    int MAX_NDOC = 25205179;
+    unsigned int *docid       = new unsigned int[MAX_NDOC];
+    unsigned int *docid_check = new unsigned int[MAX_NDOC];
+
+    unsigned int *fre  = new unsigned int[MAX_NDOC];
+    unsigned int *maxc = new unsigned int[MAX_NDOC/BS];
+    unsigned int *aux  = new unsigned int[MAX_NDOC];
+    unsigned int *all_array = new unsigned int[2048]; // extra array for coding
+
+    int listSize = get_list("information", docid, fre, maxc);
+    cout << "list size is " << listSize << endl;
[...]
diff --git a/aux/OPT_PFD/opt_p4.h b/aux/OPT_PFD/opt_p4.h
[...]
+            if (chunk_size > size * 4) // int bytes
+            {
+                chunk_size = size * 4;
+                b = l;
+                temp_en = ex_n;
+            }
+        }
+
+        csize += chunk_size;
+        //printf("encode:%u\n", b);
+        p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n);
+        offset += size;
+    }
+
+    return csize;
+}
diff --git a/aux/OPT_PFD/pf.h b/aux/OPT_PFD/pf.h
new file mode 100644
index 0000000..788f8cc
--- /dev/null
+++ b/aux/OPT_PFD/pf.h
@@ -0,0 +1,158 @@
+#include "s16head.h"
+#include "unpack.h"
+
+#define BS 128
+#define FRAC 0.10
+#define S 16
+#define PCHUNK 128
+
+void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
+
+int detailed_p4_encode(unsigned int **w, unsigned int *p, int num, int *chunk_size, int *exception_n)
+{
+    int i, j, t, s;
+
+    unsigned int b = cnum[num];   /* bit width for this chunk */
+    int bb_e;
+    int bb_p;
+    int p_low;
+    unsigned int e_n = 0;         /* number of exceptions */
+    int max_p = 0;
+    int max_e = 0;
+
+    unsigned int *out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+    unsigned int *ex  = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+    unsigned int *po  = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+
+    unsigned int *tp = NULL;
+    unsigned int *_pp, *_ww;
+
+    if (b == 32)
+    {
+        (*w)[0] = (num<<10) + (0); /* header: width index in bits 10.., exception count (0) in the low 10 bits, matching the decoder's "(flag>>10) & 31" */
+        *w += 1;
+        for (i = 0; i < PCHUNK ; i++)
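+            /* b == 32: values cannot be packed, copy them through verbatim */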
(*w)[i] = p[i]; + *w += (PCHUNK); + (*chunk_size) = 1 + BS; + + free(out); + free(ex); + free(po); + return 0; + } + + for (i = 0; i < PCHUNK ; i++) + { + if ( p[i] >= (1<> b); + po[(e_n++)] = i; // + } + else + out[i] = p[i]; + } + + if (1) // force to pass every time + { + /*get the gap of position*/ + for(j = e_n-1;j>0;j--) + { + po[j] = po[j] - po[j-1] ; + po[j] --; + } + + s = ((b * PCHUNK)>>5); + tp = (*w); + (*w)[0] = ((num<<10))+e_n; // record b and number of exceptions into this value, in the other version we pick this value out and did not count it + (*w) += 1; + for (i = 0; i < s; i++) (*w)[i] = 0; + pack(out, b, PCHUNK , *w); + *w += s; + + unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ; + for(j=0;j>5; + s = 32 - b - (bp & 31); + if (s >= 0) + w[wp] |= (v[i]<>s); + w[wp+1] = (v[i]<<(32-s)); + } + } +} + +/*modified p4decode */ +unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w, unsigned int * all_array) +{ + + int i, s; + unsigned int x; + int flag = _w[0]; + (_w)++; + + unsigned int *_ww,*_pp; + unsigned int b = ((flag>>10) & 31); + unsigned int e_n = (flag & 1023) ; + + (unpack[b])(_p, _w); + + b = cnum[b]; + _w += ((b * BS)>>5); + unsigned int _k = 0; + unsigned int psum = 0; + if(e_n != 0 ) + { + for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);) + { + S16_DECODE(_ww, _pp); + } + + _w += (_ww - _w); + psum = all_array[0]; + + for(i=0;i>28; \ + switch(_k) \ + { \ + case 0: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 1: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 2: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 3; _p++; \ + *_p = (*_w>>9) & 3; _p++; \ + *_p = (*_w>>11) & 3; _p++; \ + *_p = (*_w>>13) & 3; _p++; \ + *_p = (*_w>>15) & 3; _p++; \ + *_p = (*_w>>17) & 3; _p++; \ + *_p = (*_w>>19) & 3; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = 
(*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 3: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 4: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 5: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 7; _p++; \ + *_p = (*_w>>7) & 7; _p++; \ + *_p = (*_w>>10) & 7; _p++; \ + *_p = (*_w>>13) & 7; _p++; \ + *_p = (*_w>>16) & 7; _p++; \ + *_p = (*_w>>19) & 7; _p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 6: \ + *_p = (*_w) & 7; _p++; \ + *_p = (*_w>>3) & 15; _p++; \ + *_p = (*_w>>7) & 15; _p++; \ + *_p = (*_w>>11) & 15; _p++; \ + *_p = (*_w>>15) & 15; _p++; \ + *_p = (*_w>>19) & 7; _p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 7: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 15; _p++; \ + *_p = (*_w>>12) & 15; _p++; \ + *_p = (*_w>>16) & 15; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 8: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 31; _p++; \ + *_p = (*_w>>15) & 31; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 9: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 31; _p++; \ + *_p = (*_w>>13) & 31; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 10: \ + *_p = (*_w) & 63; _p++; \ + *_p = (*_w>>6) & 63; _p++; \ + *_p = (*_w>>12) & 63; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 11: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 63; _p++; \ + *_p = (*_w>>16) & 63; _p++; \ + *_p = (*_w>>22) & 63; _p++; \ + break; \ + case 12: \ + *_p = (*_w) & 127; _p++; \ + *_p = (*_w>>7) & 127; _p++; \ + *_p = (*_w>>14) & 127; _p++; \ + *_p = (*_w>>21) & 127; _p++; \ + break; \ + case 13: \ + *_p = (*_w) & 1023; _p++; \ + *_p = (*_w>>10) & 511; _p++; \ + *_p = (*_w>>19) & 511; _p++; \ + break; \ + case 14: \ + *_p = (*_w) & 16383; _p++; \ + *_p = (*_w>>14) & 16383; _p++; \ + break; \ + case 15: \ + *_p = (*_w) & ((1<<28)-1); _p++; \ + break; \ + }\ + _w++; \ +} + + + + + diff --git a/aux/OPT_PFD/unpack.h b/aux/OPT_PFD/unpack.h new file mode 100644 index 0000000..fa810e9 --- /dev/null +++ b/aux/OPT_PFD/unpack.h @@ -0,0 +1,773 @@ + +/*************************************************************/ +/* macros for fast unpacking of integers of fixed bit 
length */ +/*************************************************************/ + +#define BS 128 + +/* supported bit lengths */ +int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32}; + +void unpack0(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i++) p[i] = 0; +} + + +void unpack1(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 1) + { + p[0] = (w[0] >> 31); + p[1] = (w[0] >> 30) & 1; + p[2] = (w[0] >> 29) & 1; + p[3] = (w[0] >> 28) & 1; + p[4] = (w[0] >> 27) & 1; + p[5] = (w[0] >> 26) & 1; + p[6] = (w[0] >> 25) & 1; + p[7] = (w[0] >> 24) & 1; + p[8] = (w[0] >> 23) & 1; + p[9] = (w[0] >> 22) & 1; + p[10] = (w[0] >> 21) & 1; + p[11] = (w[0] >> 20) & 1; + p[12] = (w[0] >> 19) & 1; + p[13] = (w[0] >> 18) & 1; + p[14] = (w[0] >> 17) & 1; + p[15] = (w[0] >> 16) & 1; + p[16] = (w[0] >> 15) & 1; + p[17] = (w[0] >> 14) & 1; + p[18] = (w[0] >> 13) & 1; + p[19] = (w[0] >> 12) & 1; + p[20] = (w[0] >> 11) & 1; + p[21] = (w[0] >> 10) & 1; + p[22] = (w[0] >> 9) & 1; + p[23] = (w[0] >> 8) & 1; + p[24] = (w[0] >> 7) & 1; + p[25] = (w[0] >> 6) & 1; + p[26] = (w[0] >> 5) & 1; + p[27] = (w[0] >> 4) & 1; + p[28] = (w[0] >> 3) & 1; + p[29] = (w[0] >> 2) & 1; + p[30] = (w[0] >> 1) & 1; + p[31] = (w[0]) & 1; + } +} + + +void unpack2(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 2) + { + p[0] = (w[0] >> 30); + p[1] = (w[0] >> 28) & 3; + p[2] = (w[0] >> 26) & 3; + p[3] = (w[0] >> 24) & 3; + p[4] = (w[0] >> 22) & 3; + p[5] = (w[0] >> 20) & 3; + p[6] = (w[0] >> 18) & 3; + p[7] = (w[0] >> 16) & 3; + p[8] = (w[0] >> 14) & 3; + p[9] = (w[0] >> 12) & 3; + p[10] = (w[0] >> 10) & 3; + p[11] = (w[0] >> 8) & 3; + p[12] = (w[0] >> 6) & 3; + p[13] = (w[0] >> 4) & 3; + p[14] = (w[0] >> 2) & 3; + p[15] = (w[0]) & 3; + p[16] = (w[1] >> 30); + p[17] = (w[1] >> 28) & 3; + p[18] = (w[1] >> 26) & 3; + p[19] = (w[1] >> 24) & 3; + p[20] = (w[1] >> 22) & 3; + p[21] = (w[1] >> 20) & 3; + p[22] = (w[1] >> 18) & 3; + p[23] = (w[1] >> 16) & 3; + p[24] = (w[1] >> 14) & 3; + p[25] = (w[1] >> 12) & 3; + p[26] = (w[1] >> 10) & 3; + p[27] = (w[1] >> 8) & 3; + p[28] = (w[1] >> 6) & 3; + p[29] = (w[1] >> 4) & 3; + p[30] = (w[1] >> 2) & 3; + p[31] = (w[1]) & 3; + } +} + + +void unpack3(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 3) + { + p[0] = (w[0] >> 29); + p[1] = (w[0] >> 26) & 7; + p[2] = (w[0] >> 23) & 7; + p[3] = (w[0] >> 20) & 7; + p[4] = (w[0] >> 17) & 7; + p[5] = (w[0] >> 14) & 7; + p[6] = (w[0] >> 11) & 7; + p[7] = (w[0] >> 8) & 7; + p[8] = (w[0] >> 5) & 7; + p[9] = (w[0] >> 2) & 7; + p[10] = (w[0] << 1) & 7; + p[10] |= (w[1] >> 31); + p[11] = (w[1] >> 28) & 7; + p[12] = (w[1] >> 25) & 7; + p[13] = (w[1] >> 22) & 7; + p[14] = (w[1] >> 19) & 7; + p[15] = (w[1] >> 16) & 7; + p[16] = (w[1] >> 13) & 7; + p[17] = (w[1] >> 10) & 7; + p[18] = (w[1] >> 7) & 7; + p[19] = (w[1] >> 4) & 7; + p[20] = (w[1] >> 1) & 7; + p[21] = (w[1] << 2) & 7; + p[21] |= (w[2] >> 30); + p[22] = (w[2] >> 27) & 7; + p[23] = (w[2] >> 24) & 7; + p[24] = (w[2] >> 21) & 7; + p[25] = (w[2] >> 18) & 7; + p[26] = (w[2] >> 15) & 7; + p[27] = (w[2] >> 12) & 7; + p[28] = (w[2] >> 9) & 7; + p[29] = (w[2] >> 6) & 7; + p[30] = (w[2] >> 3) & 7; + p[31] = (w[2]) & 7; + } +} + + +void unpack4(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 4) + { + p[0] = (w[0] >> 28); + p[1] = (w[0] >> 24) & 15; + p[2] = (w[0] >> 20) & 15; + p[3] = (w[0] >> 16) & 15; + p[4] = (w[0] >> 12) & 
15; + p[5] = (w[0] >> 8) & 15; + p[6] = (w[0] >> 4) & 15; + p[7] = (w[0]) & 15; + p[8] = (w[1] >> 28); + p[9] = (w[1] >> 24) & 15; + p[10] = (w[1] >> 20) & 15; + p[11] = (w[1] >> 16) & 15; + p[12] = (w[1] >> 12) & 15; + p[13] = (w[1] >> 8) & 15; + p[14] = (w[1] >> 4) & 15; + p[15] = (w[1]) & 15; + p[16] = (w[2] >> 28); + p[17] = (w[2] >> 24) & 15; + p[18] = (w[2] >> 20) & 15; + p[19] = (w[2] >> 16) & 15; + p[20] = (w[2] >> 12) & 15; + p[21] = (w[2] >> 8) & 15; + p[22] = (w[2] >> 4) & 15; + p[23] = (w[2]) & 15; + p[24] = (w[3] >> 28); + p[25] = (w[3] >> 24) & 15; + p[26] = (w[3] >> 20) & 15; + p[27] = (w[3] >> 16) & 15; + p[28] = (w[3] >> 12) & 15; + p[29] = (w[3] >> 8) & 15; + p[30] = (w[3] >> 4) & 15; + p[31] = (w[3]) & 15; + } +} + + +void unpack5(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 5) + { + p[0] = (w[0] >> 27); + p[1] = (w[0] >> 22) & 31; + p[2] = (w[0] >> 17) & 31; + p[3] = (w[0] >> 12) & 31; + p[4] = (w[0] >> 7) & 31; + p[5] = (w[0] >> 2) & 31; + p[6] = (w[0] << 3) & 31; + p[6] |= (w[1] >> 29); + p[7] = (w[1] >> 24) & 31; + p[8] = (w[1] >> 19) & 31; + p[9] = (w[1] >> 14) & 31; + p[10] = (w[1] >> 9) & 31; + p[11] = (w[1] >> 4) & 31; + p[12] = (w[1] << 1) & 31; + p[12] |= (w[2] >> 31); + p[13] = (w[2] >> 26) & 31; + p[14] = (w[2] >> 21) & 31; + p[15] = (w[2] >> 16) & 31; + p[16] = (w[2] >> 11) & 31; + p[17] = (w[2] >> 6) & 31; + p[18] = (w[2] >> 1) & 31; + p[19] = (w[2] << 4) & 31; + p[19] |= (w[3] >> 28); + p[20] = (w[3] >> 23) & 31; + p[21] = (w[3] >> 18) & 31; + p[22] = (w[3] >> 13) & 31; + p[23] = (w[3] >> 8) & 31; + p[24] = (w[3] >> 3) & 31; + p[25] = (w[3] << 2) & 31; + p[25] |= (w[4] >> 30); + p[26] = (w[4] >> 25) & 31; + p[27] = (w[4] >> 20) & 31; + p[28] = (w[4] >> 15) & 31; + p[29] = (w[4] >> 10) & 31; + p[30] = (w[4] >> 5) & 31; + p[31] = (w[4]) & 31; + } +} + + +void unpack6(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 6) + { + p[0] = (w[0] >> 26); + p[1] = (w[0] >> 20) & 63; + p[2] = (w[0] >> 14) & 63; + p[3] = (w[0] >> 8) & 63; + p[4] = (w[0] >> 2) & 63; + p[5] = (w[0] << 4) & 63; + p[5] |= (w[1] >> 28); + p[6] = (w[1] >> 22) & 63; + p[7] = (w[1] >> 16) & 63; + p[8] = (w[1] >> 10) & 63; + p[9] = (w[1] >> 4) & 63; + p[10] = (w[1] << 2) & 63; + p[10] |= (w[2] >> 30); + p[11] = (w[2] >> 24) & 63; + p[12] = (w[2] >> 18) & 63; + p[13] = (w[2] >> 12) & 63; + p[14] = (w[2] >> 6) & 63; + p[15] = (w[2]) & 63; + p[16] = (w[3] >> 26); + p[17] = (w[3] >> 20) & 63; + p[18] = (w[3] >> 14) & 63; + p[19] = (w[3] >> 8) & 63; + p[20] = (w[3] >> 2) & 63; + p[21] = (w[3] << 4) & 63; + p[21] |= (w[4] >> 28); + p[22] = (w[4] >> 22) & 63; + p[23] = (w[4] >> 16) & 63; + p[24] = (w[4] >> 10) & 63; + p[25] = (w[4] >> 4) & 63; + p[26] = (w[4] << 2) & 63; + p[26] |= (w[5] >> 30); + p[27] = (w[5] >> 24) & 63; + p[28] = (w[5] >> 18) & 63; + p[29] = (w[5] >> 12) & 63; + p[30] = (w[5] >> 6) & 63; + p[31] = (w[5]) & 63; + } +} + + +void unpack7(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 7) + { + p[0] = (w[0] >> 25); + p[1] = (w[0] >> 18) & 127; + p[2] = (w[0] >> 11) & 127; + p[3] = (w[0] >> 4) & 127; + p[4] = (w[0] << 3) & 127; + p[4] |= (w[1] >> 29); + p[5] = (w[1] >> 22) & 127; + p[6] = (w[1] >> 15) & 127; + p[7] = (w[1] >> 8) & 127; + p[8] = (w[1] >> 1) & 127; + p[9] = (w[1] << 6) & 127; + p[9] |= (w[2] >> 26); + p[10] = (w[2] >> 19) & 127; + p[11] = (w[2] >> 12) & 127; + p[12] = (w[2] >> 5) & 127; + p[13] = (w[2] << 2) & 127; + p[13] |= (w[3] >> 
30); + p[14] = (w[3] >> 23) & 127; + p[15] = (w[3] >> 16) & 127; + p[16] = (w[3] >> 9) & 127; + p[17] = (w[3] >> 2) & 127; + p[18] = (w[3] << 5) & 127; + p[18] |= (w[4] >> 27); + p[19] = (w[4] >> 20) & 127; + p[20] = (w[4] >> 13) & 127; + p[21] = (w[4] >> 6) & 127; + p[22] = (w[4] << 1) & 127; + p[22] |= (w[5] >> 31); + p[23] = (w[5] >> 24) & 127; + p[24] = (w[5] >> 17) & 127; + p[25] = (w[5] >> 10) & 127; + p[26] = (w[5] >> 3) & 127; + p[27] = (w[5] << 4) & 127; + p[27] |= (w[6] >> 28); + p[28] = (w[6] >> 21) & 127; + p[29] = (w[6] >> 14) & 127; + p[30] = (w[6] >> 7) & 127; + p[31] = (w[6]) & 127; + } +} + + +void unpack8(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 8) + { + p[0] = (w[0] >> 24); + p[1] = (w[0] >> 16) & 255; + p[2] = (w[0] >> 8) & 255; + p[3] = (w[0]) & 255; + p[4] = (w[1] >> 24); + p[5] = (w[1] >> 16) & 255; + p[6] = (w[1] >> 8) & 255; + p[7] = (w[1]) & 255; + p[8] = (w[2] >> 24); + p[9] = (w[2] >> 16) & 255; + p[10] = (w[2] >> 8) & 255; + p[11] = (w[2]) & 255; + p[12] = (w[3] >> 24); + p[13] = (w[3] >> 16) & 255; + p[14] = (w[3] >> 8) & 255; + p[15] = (w[3]) & 255; + p[16] = (w[4] >> 24); + p[17] = (w[4] >> 16) & 255; + p[18] = (w[4] >> 8) & 255; + p[19] = (w[4]) & 255; + p[20] = (w[5] >> 24); + p[21] = (w[5] >> 16) & 255; + p[22] = (w[5] >> 8) & 255; + p[23] = (w[5]) & 255; + p[24] = (w[6] >> 24); + p[25] = (w[6] >> 16) & 255; + p[26] = (w[6] >> 8) & 255; + p[27] = (w[6]) & 255; + p[28] = (w[7] >> 24); + p[29] = (w[7] >> 16) & 255; + p[30] = (w[7] >> 8) & 255; + p[31] = (w[7]) & 255; + } +} + + +void unpack9(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 9) + { + p[0] = (w[0] >> 23); + p[1] = (w[0] >> 14) & 511; + p[2] = (w[0] >> 5) & 511; + p[3] = (w[0] << 4) & 511; + p[3] |= (w[1] >> 28); + p[4] = (w[1] >> 19) & 511; + p[5] = (w[1] >> 10) & 511; + p[6] = (w[1] >> 1) & 511; + p[7] = (w[1] << 8) & 511; + p[7] |= (w[2] >> 24); + p[8] = (w[2] >> 15) & 511; + p[9] = (w[2] >> 6) & 511; + p[10] = (w[2] << 3) & 511; + p[10] |= (w[3] >> 29); + p[11] = (w[3] >> 20) & 511; + p[12] = (w[3] >> 11) & 511; + p[13] = (w[3] >> 2) & 511; + p[14] = (w[3] << 7) & 511; + p[14] |= (w[4] >> 25); + p[15] = (w[4] >> 16) & 511; + p[16] = (w[4] >> 7) & 511; + p[17] = (w[4] << 2) & 511; + p[17] |= (w[5] >> 30); + p[18] = (w[5] >> 21) & 511; + p[19] = (w[5] >> 12) & 511; + p[20] = (w[5] >> 3) & 511; + p[21] = (w[5] << 6) & 511; + p[21] |= (w[6] >> 26); + p[22] = (w[6] >> 17) & 511; + p[23] = (w[6] >> 8) & 511; + p[24] = (w[6] << 1) & 511; + p[24] |= (w[7] >> 31); + p[25] = (w[7] >> 22) & 511; + p[26] = (w[7] >> 13) & 511; + p[27] = (w[7] >> 4) & 511; + p[28] = (w[7] << 5) & 511; + p[28] |= (w[8] >> 27); + p[29] = (w[8] >> 18) & 511; + p[30] = (w[8] >> 9) & 511; + p[31] = (w[8]) & 511; + } +} + + +void unpack10(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 10) + { + p[0] = (w[0] >> 22); + p[1] = (w[0] >> 12) & 1023; + p[2] = (w[0] >> 2) & 1023; + p[3] = (w[0] << 8) & 1023; + p[3] |= (w[1] >> 24); + p[4] = (w[1] >> 14) & 1023; + p[5] = (w[1] >> 4) & 1023; + p[6] = (w[1] << 6) & 1023; + p[6] |= (w[2] >> 26); + p[7] = (w[2] >> 16) & 1023; + p[8] = (w[2] >> 6) & 1023; + p[9] = (w[2] << 4) & 1023; + p[9] |= (w[3] >> 28); + p[10] = (w[3] >> 18) & 1023; + p[11] = (w[3] >> 8) & 1023; + p[12] = (w[3] << 2) & 1023; + p[12] |= (w[4] >> 30); + p[13] = (w[4] >> 20) & 1023; + p[14] = (w[4] >> 10) & 1023; + p[15] = (w[4]) & 1023; + p[16] = (w[5] >> 22); + p[17] = (w[5] >> 
12) & 1023; + p[18] = (w[5] >> 2) & 1023; + p[19] = (w[5] << 8) & 1023; + p[19] |= (w[6] >> 24); + p[20] = (w[6] >> 14) & 1023; + p[21] = (w[6] >> 4) & 1023; + p[22] = (w[6] << 6) & 1023; + p[22] |= (w[7] >> 26); + p[23] = (w[7] >> 16) & 1023; + p[24] = (w[7] >> 6) & 1023; + p[25] = (w[7] << 4) & 1023; + p[25] |= (w[8] >> 28); + p[26] = (w[8] >> 18) & 1023; + p[27] = (w[8] >> 8) & 1023; + p[28] = (w[8] << 2) & 1023; + p[28] |= (w[9] >> 30); + p[29] = (w[9] >> 20) & 1023; + p[30] = (w[9] >> 10) & 1023; + p[31] = (w[9]) & 1023; + } +} + + +void unpack11(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 11) + { + p[0] = (w[0] >> 21); + p[1] = (w[0] >> 10) & 2047; + p[2] = (w[0] << 1) & 2047; + p[2] |= (w[1] >> 31); + p[3] = (w[1] >> 20) & 2047; + p[4] = (w[1] >> 9) & 2047; + p[5] = (w[1] << 2) & 2047; + p[5] |= (w[2] >> 30); + p[6] = (w[2] >> 19) & 2047; + p[7] = (w[2] >> 8) & 2047; + p[8] = (w[2] << 3) & 2047; + p[8] |= (w[3] >> 29); + p[9] = (w[3] >> 18) & 2047; + p[10] = (w[3] >> 7) & 2047; + p[11] = (w[3] << 4) & 2047; + p[11] |= (w[4] >> 28); + p[12] = (w[4] >> 17) & 2047; + p[13] = (w[4] >> 6) & 2047; + p[14] = (w[4] << 5) & 2047; + p[14] |= (w[5] >> 27); + p[15] = (w[5] >> 16) & 2047; + p[16] = (w[5] >> 5) & 2047; + p[17] = (w[5] << 6) & 2047; + p[17] |= (w[6] >> 26); + p[18] = (w[6] >> 15) & 2047; + p[19] = (w[6] >> 4) & 2047; + p[20] = (w[6] << 7) & 2047; + p[20] |= (w[7] >> 25); + p[21] = (w[7] >> 14) & 2047; + p[22] = (w[7] >> 3) & 2047; + p[23] = (w[7] << 8) & 2047; + p[23] |= (w[8] >> 24); + p[24] = (w[8] >> 13) & 2047; + p[25] = (w[8] >> 2) & 2047; + p[26] = (w[8] << 9) & 2047; + p[26] |= (w[9] >> 23); + p[27] = (w[9] >> 12) & 2047; + p[28] = (w[9] >> 1) & 2047; + p[29] = (w[9] << 10) & 2047; + p[29] |= (w[10] >> 22); + p[30] = (w[10] >> 11) & 2047; + p[31] = (w[10]) & 2047; + } +} + + +void unpack12(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 12) + { + p[0] = (w[0] >> 20); + p[1] = (w[0] >> 8) & 4095; + p[2] = (w[0] << 4) & 4095; + p[2] |= (w[1] >> 28); + p[3] = (w[1] >> 16) & 4095; + p[4] = (w[1] >> 4) & 4095; + p[5] = (w[1] << 8) & 4095; + p[5] |= (w[2] >> 24); + p[6] = (w[2] >> 12) & 4095; + p[7] = (w[2]) & 4095; + p[8] = (w[3] >> 20); + p[9] = (w[3] >> 8) & 4095; + p[10] = (w[3] << 4) & 4095; + p[10] |= (w[4] >> 28); + p[11] = (w[4] >> 16) & 4095; + p[12] = (w[4] >> 4) & 4095; + p[13] = (w[4] << 8) & 4095; + p[13] |= (w[5] >> 24); + p[14] = (w[5] >> 12) & 4095; + p[15] = (w[5]) & 4095; + p[16] = (w[6] >> 20); + p[17] = (w[6] >> 8) & 4095; + p[18] = (w[6] << 4) & 4095; + p[18] |= (w[7] >> 28); + p[19] = (w[7] >> 16) & 4095; + p[20] = (w[7] >> 4) & 4095; + p[21] = (w[7] << 8) & 4095; + p[21] |= (w[8] >> 24); + p[22] = (w[8] >> 12) & 4095; + p[23] = (w[8]) & 4095; + p[24] = (w[9] >> 20); + p[25] = (w[9] >> 8) & 4095; + p[26] = (w[9] << 4) & 4095; + p[26] |= (w[10] >> 28); + p[27] = (w[10] >> 16) & 4095; + p[28] = (w[10] >> 4) & 4095; + p[29] = (w[10] << 8) & 4095; + p[29] |= (w[11] >> 24); + p[30] = (w[11] >> 12) & 4095; + p[31] = (w[11]) & 4095; + } +} + + +void unpack13(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 13) + { + p[0] = (w[0] >> 19); + p[1] = (w[0] >> 6) & 8191; + p[2] = (w[0] << 7) & 8191; + p[2] |= (w[1] >> 25); + p[3] = (w[1] >> 12) & 8191; + p[4] = (w[1] << 1) & 8191; + p[4] |= (w[2] >> 31); + p[5] = (w[2] >> 18) & 8191; + p[6] = (w[2] >> 5) & 8191; + p[7] = (w[2] << 8) & 8191; + p[7] |= (w[3] >> 24); + p[8] = (w[3] >> 11) 
& 8191; + p[9] = (w[3] << 2) & 8191; + p[9] |= (w[4] >> 30); + p[10] = (w[4] >> 17) & 8191; + p[11] = (w[4] >> 4) & 8191; + p[12] = (w[4] << 9) & 8191; + p[12] |= (w[5] >> 23); + p[13] = (w[5] >> 10) & 8191; + p[14] = (w[5] << 3) & 8191; + p[14] |= (w[6] >> 29); + p[15] = (w[6] >> 16) & 8191; + p[16] = (w[6] >> 3) & 8191; + p[17] = (w[6] << 10) & 8191; + p[17] |= (w[7] >> 22); + p[18] = (w[7] >> 9) & 8191; + p[19] = (w[7] << 4) & 8191; + p[19] |= (w[8] >> 28); + p[20] = (w[8] >> 15) & 8191; + p[21] = (w[8] >> 2) & 8191; + p[22] = (w[8] << 11) & 8191; + p[22] |= (w[9] >> 21); + p[23] = (w[9] >> 8) & 8191; + p[24] = (w[9] << 5) & 8191; + p[24] |= (w[10] >> 27); + p[25] = (w[10] >> 14) & 8191; + p[26] = (w[10] >> 1) & 8191; + p[27] = (w[10] << 12) & 8191; + p[27] |= (w[11] >> 20); + p[28] = (w[11] >> 7) & 8191; + p[29] = (w[11] << 6) & 8191; + p[29] |= (w[12] >> 26); + p[30] = (w[12] >> 13) & 8191; + p[31] = (w[12]) & 8191; + } +} + + +void unpack16(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 16) + { + p[0] = (w[0] >> 16); + p[1] = (w[0]) & 65535; + p[2] = (w[1] >> 16); + p[3] = (w[1]) & 65535; + p[4] = (w[2] >> 16); + p[5] = (w[2]) & 65535; + p[6] = (w[3] >> 16); + p[7] = (w[3]) & 65535; + p[8] = (w[4] >> 16); + p[9] = (w[4]) & 65535; + p[10] = (w[5] >> 16); + p[11] = (w[5]) & 65535; + p[12] = (w[6] >> 16); + p[13] = (w[6]) & 65535; + p[14] = (w[7] >> 16); + p[15] = (w[7]) & 65535; + p[16] = (w[8] >> 16); + p[17] = (w[8]) & 65535; + p[18] = (w[9] >> 16); + p[19] = (w[9]) & 65535; + p[20] = (w[10] >> 16); + p[21] = (w[10]) & 65535; + p[22] = (w[11] >> 16); + p[23] = (w[11]) & 65535; + p[24] = (w[12] >> 16); + p[25] = (w[12]) & 65535; + p[26] = (w[13] >> 16); + p[27] = (w[13]) & 65535; + p[28] = (w[14] >> 16); + p[29] = (w[14]) & 65535; + p[30] = (w[15] >> 16); + p[31] = (w[15]) & 65535; + } +} + + +void unpack20(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 20) + { + p[0] = (w[0] >> 12); + p[1] = (w[0] << 8) & ((1<<20)-1); + p[1] |= (w[1] >> 24); + p[2] = (w[1] >> 4) & ((1<<20)-1); + p[3] = (w[1] << 16) & ((1<<20)-1); + p[3] |= (w[2] >> 16); + p[4] = (w[2] << 4) & ((1<<20)-1); + p[4] |= (w[3] >> 28); + p[5] = (w[3] >> 8) & ((1<<20)-1); + p[6] = (w[3] << 12) & ((1<<20)-1); + p[6] |= (w[4] >> 20); + p[7] = (w[4]) & ((1<<20)-1); + p[8] = (w[5] >> 12); + p[9] = (w[5] << 8) & ((1<<20)-1); + p[9] |= (w[6] >> 24); + p[10] = (w[6] >> 4) & ((1<<20)-1); + p[11] = (w[6] << 16) & ((1<<20)-1); + p[11] |= (w[7] >> 16); + p[12] = (w[7] << 4) & ((1<<20)-1); + p[12] |= (w[8] >> 28); + p[13] = (w[8] >> 8) & ((1<<20)-1); + p[14] = (w[8] << 12) & ((1<<20)-1); + p[14] |= (w[9] >> 20); + p[15] = (w[9]) & ((1<<20)-1); + p[16] = (w[10] >> 12); + p[17] = (w[10] << 8) & ((1<<20)-1); + p[17] |= (w[11] >> 24); + p[18] = (w[11] >> 4) & ((1<<20)-1); + p[19] = (w[11] << 16) & ((1<<20)-1); + p[19] |= (w[12] >> 16); + p[20] = (w[12] << 4) & ((1<<20)-1); + p[20] |= (w[13] >> 28); + p[21] = (w[13] >> 8) & ((1<<20)-1); + p[22] = (w[13] << 12) & ((1<<20)-1); + p[22] |= (w[14] >> 20); + p[23] = (w[14]) & ((1<<20)-1); + p[24] = (w[15] >> 12); + p[25] = (w[15] << 8) & ((1<<20)-1); + p[25] |= (w[16] >> 24); + p[26] = (w[16] >> 4) & ((1<<20)-1); + p[27] = (w[16] << 16) & ((1<<20)-1); + p[27] |= (w[17] >> 16); + p[28] = (w[17] << 4) & ((1<<20)-1); + p[28] |= (w[18] >> 28); + p[29] = (w[18] >> 8) & ((1<<20)-1); + p[30] = (w[18] << 12) & ((1<<20)-1); + p[30] |= (w[19] >> 20); + p[31] = (w[19]) & ((1<<20)-1); + } +} + + +void unpack32(unsigned 
int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 32) + { + p[0] = w[0]; + p[1] = w[1]; + p[2] = w[2]; + p[3] = w[3]; + p[4] = w[4]; + p[5] = w[5]; + p[6] = w[6]; + p[7] = w[7]; + p[8] = w[8]; + p[9] = w[9]; + p[10] = w[10]; + p[11] = w[11]; + p[12] = w[12]; + p[13] = w[13]; + p[14] = w[14]; + p[15] = w[15]; + p[16] = w[16]; + p[17] = w[17]; + p[18] = w[18]; + p[19] = w[19]; + p[20] = w[20]; + p[21] = w[21]; + p[22] = w[22]; + p[23] = w[23]; + p[24] = w[24]; + p[25] = w[25]; + p[26] = w[26]; + p[27] = w[27]; + p[28] = w[28]; + p[29] = w[29]; + p[30] = w[30]; + p[31] = w[31]; + } +} + + +typedef void (*pf)(unsigned int *p, unsigned int *w); +pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5, + unpack6, unpack7, unpack8, unpack9, unpack10, unpack11, + unpack12, unpack13, unpack16, unpack20, unpack32}; + diff --git a/aux/simdcomp/bitpacka.c b/aux/simdcomp/bitpacka.c new file mode 100644 index 0000000..d23507d --- /dev/null +++ b/aux/simdcomp/bitpacka.c @@ -0,0 +1,17773 @@ +#include "bitpacka.h" +#define INLINE inline +uint32_t * nullpacker(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + return out; +} + + const uint32_t * nullunpacker8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,8 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 1 ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 3 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 5 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 7 ; + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 8 ; + *out |= ( (*in++) ) << 10 ; + *out |= ( (*in++) ) << 12 ; + *out |= ( (*in++) ) << 14 ; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + 
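+ /* pattern in these generated packers: each value is OR-ed into the current
+    32-bit word at a growing shift; when a value straddles a word boundary its
+    low bits finish this word via "<< shift" and its remaining high bits open
+    the next word via ">> (b - spill)" */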
++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( 
(*in) ) >> ( 13 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; 
+ *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_8(const uint32_t * 
__restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( 
(*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + +#if 0 +#define OUTI(__x) *out++ +#define OUT(__x) *out +#define OUI out++ +#else +#define OUTI(__x) out[__x] +#define OUT(__x) out[__x] +#define OUI +#endif +const INLINE uint32_t * __fastunpack1_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) & 1; + OUTI( 1) = ( (*in) >> 1 ) & 1; + OUTI( 2) = ( (*in) >> 2 ) & 1; + OUTI( 3) = ( (*in) >> 3 ) & 1; + OUTI( 4) = ( (*in) >> 4 ) & 1; + OUTI( 5) = ( (*in) >> 5 ) & 1; + OUTI( 6) = ( (*in) >> 6 ) & 1; + OUTI( 7) = ( (*in) >> 7 ) & 1; + return in + 1; +} + +const INLINE uint32_t * __fastunpack2_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + OUTI( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + OUTI( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + OUTI( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + OUTI( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + OUTI( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + OUTI( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack3_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + OUTI( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + OUTI( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + OUTI( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + OUTI( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + OUTI( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + OUTI( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + OUTI( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack4_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + OUTI( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + OUTI( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + OUTI( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + OUTI( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + OUTI( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + OUTI( 7) = ( (*in++) >> 28 ) ; + return in; +} + +const uint32_t * __fastunpack5_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + OUTI( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + OUTI( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + OUTI( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + OUTI( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + OUTI( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + OUT( 6) = ( (*in++) >> 30 ) ; + OUT( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + OUI; + OUTI( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack6_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) 
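+/* generated unpackers extract with "% (1U << b)", which compilers reduce to an
+   AND with (1<<b)-1; a value split across two words is completed by OR-ing in
+   the next word's low bits, shifted left past the bits already extracted */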
{ + OUTI( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + OUTI( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + OUTI( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + OUTI( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + OUTI( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + OUT( 5) = ( (*in++) >> 30 ) ; + OUT( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + OUI; + OUTI( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + OUTI( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack7_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + OUTI( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + OUTI( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + OUTI( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + OUT( 4) = ( (*in++) >> 28 ) ; + OUT( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + OUI; + OUTI( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + OUTI( 6 ) = ( (*in) >> 10 ) % (1U << 7 ) ; + OUTI( 7 ) = ( (*in) >> 17 ) % (1U << 7 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack8_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 3) = ( (*in++) >> 24 ) ; + OUTI( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 7) = ( (*in++) >> 24 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack9_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + OUTI( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + OUTI( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + OUT( 3) = ( (*in++) >> 27 ) ; + OUT( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + OUI; + OUTI( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + OUTI( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + OUTI( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + OUT( 7) = ( (*in++) >> 31 ) ; + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack10_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + OUTI( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + OUTI( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + OUT( 3) = ( (*in++) >> 30 ) ; + OUT( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + OUI; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + OUTI( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + OUT( 6) = ( (*in++) >> 28 ) ; + OUT( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + OUI; + OUTI( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack11_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ((*in) >> 0 ) % (1U << 11 ) ; + OUTI( 1) = ((*in) >> 11 ) % (1U << 11 ) ; + OUT( 2) = ((*in++) >> 22 ) ; + OUT( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + OUI; + OUTI( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + OUTI( 4) = ((*in) >> 12 ) % (1U << 11 ) ; + OUT( 5) = (*in++) >> 23; + OUT( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + OUI; + OUTI( 6) = ((*in) >> 2 ) % (1U << 11 ) ; + OUTI( 7) = ((*in) >> 13 ) % (1U << 11 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack12_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + OUTI( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + OUT( 2) = ( (*in++) >> 24 ) ; + OUT( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + OUI; + OUTI( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + OUT( 5) = ( (*in++) >> 28 ) ; + OUT( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + OUI; + OUTI( 6) = ( (*in) >> 8 ) % (1U << 12 
) ; + OUTI( 7) = ( (*in++) >> 20 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack13_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + OUTI( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + OUT( 2) = ( (*in++) >> 26 ) ; + OUT( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + OUI; + OUTI( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + OUT( 4) = ( (*in++) >> 20 ) ; + OUT( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + OUI; + OUTI( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + OUTI( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + OUT( 7) = ( (*in++) >> 27 ); + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack14_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out++ = ( (*in) >> 0 ) % (1U << 14 ) ; + *out++ = ( (*in) >> 14 ) % (1U << 14 ) ; + *out = ( (*in++) >> 28 ) ; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out++ = ( (*in) >> 10 ) % (1U << 14 ) ; + *out = ( (*in++) >> 24 ) ; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out++ = ( (*in) >> 6 ) % (1U << 14 ) ; + *out = ( (*in++) >> 20 ) ; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out++ = ( (*in) >> 2 ) % (1U << 14 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack15_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack16_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack17_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack18_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + 
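+ /* A value that straddles a 32-bit word boundary is reassembled in two
+    steps, as at this point: the top bits of the current input word give
+    the low bits of the output, then the low bits of the next word are
+    masked and OR'd in above them. As an illustrative sketch (w0/w1, b, s
+    are local names used only in this comment, not part of the generated
+    code), a b-bit value starting at bit s of w0 with s + b > 32 is
+
+      v  = w0 >> s;                                   // its low 32-s bits
+      v |= (w1 % (1U << (b - (32 - s)))) << (32 - s); // its high bits
+
+    The OUTI/OUT/OUI macros used by the functions above are presumably
+    store-and-advance helpers defined earlier in this commit; from
+    __fastunpack14_8 on, the same stores are spelled out with explicit
+    pointer arithmetic. */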
++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack19_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack20_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack21_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack22_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack23_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; 
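+ /* The recurring `% (1U << k)` is an unsigned modulus by a power of two,
+    which compilers reduce to a bitwise AND with the constant (1U << k) - 1;
+    it simply keeps the k low-order bits, e.g. x % (1U << 10) == x & 0x3FF. */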
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker8(in,out); + + case 1: + return __fastunpack1_8(in,out); + + case 2: + return __fastunpack2_8(in,out); + + case 3: + return 
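+ /* each __fastunpackB_8 kernel decodes eight B-bit values and returns the
+    input pointer advanced past the ceil(8*B/32) words the group occupies,
+    so calls through this dispatcher can be chained group after group */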
__fastunpack3_8(in,out); + + case 4: + return __fastunpack4_8(in,out); + + case 5: + return __fastunpack5_8(in,out); + + case 6: + return __fastunpack6_8(in,out); + + case 7: + return __fastunpack7_8(in,out); + + case 8: + return __fastunpack8_8(in,out); + + case 9: + return __fastunpack9_8(in,out); + + case 10: + return __fastunpack10_8(in,out); + + case 11: + return __fastunpack11_8(in,out); + + case 12: + return __fastunpack12_8(in,out); + + case 13: + return __fastunpack13_8(in,out); + + case 14: + return __fastunpack14_8(in,out); + + case 15: + return __fastunpack15_8(in,out); + + case 16: + return __fastunpack16_8(in,out); + + case 17: + return __fastunpack17_8(in,out); + + case 18: + return __fastunpack18_8(in,out); + + case 19: + return __fastunpack19_8(in,out); + + case 20: + return __fastunpack20_8(in,out); + + case 21: + return __fastunpack21_8(in,out); + + case 22: + return __fastunpack22_8(in,out); + + case 23: + return __fastunpack23_8(in,out); + + case 24: + return __fastunpack24_8(in,out); + + case 25: + return __fastunpack25_8(in,out); + + case 26: + return __fastunpack26_8(in,out); + + case 27: + return __fastunpack27_8(in,out); + + case 28: + return __fastunpack28_8(in,out); + + case 29: + return __fastunpack29_8(in,out); + + case 30: + return __fastunpack30_8(in,out); + + case 31: + return __fastunpack31_8(in,out); + + case 32: + return __fastunpack32_8(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_8(in,out); + + case 2: + return __fastpackwithoutmask2_8(in,out); + + case 3: + return __fastpackwithoutmask3_8(in,out); + + case 4: + return __fastpackwithoutmask4_8(in,out); + + case 5: + return __fastpackwithoutmask5_8(in,out); + + case 6: + return __fastpackwithoutmask6_8(in,out); + + case 7: + return __fastpackwithoutmask7_8(in,out); + + case 8: + return __fastpackwithoutmask8_8(in,out); + + case 9: + return __fastpackwithoutmask9_8(in,out); + + case 10: + return __fastpackwithoutmask10_8(in,out); + + case 11: + return __fastpackwithoutmask11_8(in,out); + + case 12: + return __fastpackwithoutmask12_8(in,out); + + case 13: + return __fastpackwithoutmask13_8(in,out); + + case 14: + return __fastpackwithoutmask14_8(in,out); + + case 15: + return __fastpackwithoutmask15_8(in,out); + + case 16: + return __fastpackwithoutmask16_8(in,out); + + case 17: + return __fastpackwithoutmask17_8(in,out); + + case 18: + return __fastpackwithoutmask18_8(in,out); + + case 19: + return __fastpackwithoutmask19_8(in,out); + + case 20: + return __fastpackwithoutmask20_8(in,out); + + case 21: + return __fastpackwithoutmask21_8(in,out); + + case 22: + return __fastpackwithoutmask22_8(in,out); + + case 23: + return __fastpackwithoutmask23_8(in,out); + + case 24: + return __fastpackwithoutmask24_8(in,out); + + case 25: + return __fastpackwithoutmask25_8(in,out); + + case 26: + return __fastpackwithoutmask26_8(in,out); + + case 27: + return __fastpackwithoutmask27_8(in,out); + + case 28: + return __fastpackwithoutmask28_8(in,out); + + case 29: + return __fastpackwithoutmask29_8(in,out); + + case 30: + return __fastpackwithoutmask30_8(in,out); + + case 31: + return __fastpackwithoutmask31_8(in,out); + + case 32: + return __fastpackwithoutmask32_8(in,out); + + default: 
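+ /* with the throw below commented out, an unsupported bit width lets
+    control fall off the end of the function without a return value, and
+    using the result is then undefined behavior; callers must keep bit in
+    0..32 (the same holds for fastunpack_8 above) */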
+ break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,16 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out 
|= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + 
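+ /* Packing mirrors the unpacking above: each value is OR'd into the
+    current output word at an increasing shift, and when it crosses the
+    32-bit boundary the next word is started with its remaining high bits
+    via `>> (b - fill)`. An illustrative, unoptimized equivalent of these
+    unrolled kernels (n, b, acc, fill are names used only in this sketch):
+
+      uint32_t acc = 0; int fill = 0;
+      for (int i = 0; i < n; i++) {
+        acc |= in[i] << fill;                   // low part of value i
+        fill += b;
+        if (fill >= 32) {                       // output word is full
+          *out++ = acc;
+          fill -= 32;
+          acc = fill ? in[i] >> (b - fill) : 0; // carried high bits
+        }
+      }
+      if (fill) *out++ = acc;                   // flush final partial word
+
+    The generated functions trade this loop for straight-line code with
+    every shift precomputed. */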
+ *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_16(const 
uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out 
|= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( 
(*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + 
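+ /* since gcd(24, 32) = 8, the layout repeats after lcm(24, 32) = 96 bits:
+    four 24-bit values fill exactly three output words, so this function is
+    four copies of the same three-word group; widths that divide 32 (8, 16)
+    repeat even faster and need no cross-word carries at all */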
*out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( 
(*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + 
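+ /* for widths close to 32 nearly every value straddles a boundary: with
+    b = 30 the fill position retreats by 32 - 30 = 2 per value, so the
+    carry shifts below step down 28, 26, 24, ... 2 until the pattern closes
+    after 16 values in exactly 15 output words */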
*out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask31_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + + + + +const uint32_t * __fastunpack1_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + 
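+ /* the _16 variants handle sixteen values per call instead of eight, so a
+    width-b group occupies ceil(16*b/32) = ceil(b/2) input words; here
+    b = 1 and all sixteen values come from a single word, with `& 1` as the
+    literal mask form of the `% (1U << 1)` idiom used elsewhere */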
out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack2_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack3_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack4_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack5_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) 
>> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack6_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack7_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 7 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack8_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack9_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ 
out) { + + *out = ( (*in) >> 0 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 9 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack10_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack11_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 11 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack12_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 
28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack13_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 13 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack14_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack15_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + 
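+
+ /* The next 15-bit value straddles a word boundary: its low 8 bits sit in
+    bits 24..31 of the current word and its high 7 bits in bits 0..6 of the
+    following word, hence the >> 24 followed by the masked OR shifted left
+    by 15 - 7 = 8. */
+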
out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 15 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack16_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack17_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack18_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); 
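+
+ /* Throughout these kernels, `x % (1U << b)` is simply a mask of the low b
+    bits -- identical to `x & ((1U << b) - 1)` -- and compilers reduce the
+    unsigned modulo by a power of two to a single AND. */
+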
+ out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack19_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack20_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( 
(*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack22_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack23_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % 
(1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + 
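+
+ /* At b = 26 the 16 values occupy exactly 13 words (16 * 26 = 416 = 13 * 32),
+    so the cursor ends on a word boundary and the kernel returns `in` with no
+    partially consumed word, unlike the odd-width kernels that return in + 1. */
+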
*out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack27_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + 
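+
+ /* At b = 29 each value starts 29 bits after the previous one, so the split
+    offset walks down by 32 - 29 = 3 per step (26, 23, 20, 17, 14, 11, ...),
+    which is exactly the sequence of mask widths in this unrolled kernel. */
+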
*out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out 
|= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker16(in,out); + + case 1: + return __fastunpack1_16(in,out); + + case 2: + return __fastunpack2_16(in,out); + + case 3: + return __fastunpack3_16(in,out); + + case 4: + return __fastunpack4_16(in,out); + + case 5: + return __fastunpack5_16(in,out); + + case 6: + return __fastunpack6_16(in,out); + + case 7: + return __fastunpack7_16(in,out); + + case 8: + return __fastunpack8_16(in,out); + + case 9: + return __fastunpack9_16(in,out); + + case 10: + return __fastunpack10_16(in,out); + + case 11: + return __fastunpack11_16(in,out); + + case 12: + return __fastunpack12_16(in,out); + + case 13: + return __fastunpack13_16(in,out); + + case 14: + return __fastunpack14_16(in,out); + + case 15: + return __fastunpack15_16(in,out); + + case 16: + return __fastunpack16_16(in,out); + + case 17: + return __fastunpack17_16(in,out); + + case 18: + return __fastunpack18_16(in,out); + + case 19: + return __fastunpack19_16(in,out); + + case 20: + return __fastunpack20_16(in,out); + + case 21: + return __fastunpack21_16(in,out); + + case 22: + return __fastunpack22_16(in,out); + + case 23: + return __fastunpack23_16(in,out); + + case 24: + return __fastunpack24_16(in,out); + + case 25: + return __fastunpack25_16(in,out); + + case 26: + return __fastunpack26_16(in,out); + + case 27: + return __fastunpack27_16(in,out); + + case 28: + return __fastunpack28_16(in,out); + + case 29: + return __fastunpack29_16(in,out); + + case 30: + return __fastunpack30_16(in,out); + + case 31: + return __fastunpack31_16(in,out); + + case 32: + return __fastunpack32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_16(in,out); + + case 2: + return __fastpackwithoutmask2_16(in,out); + + case 3: + return __fastpackwithoutmask3_16(in,out); + + case 4: + return __fastpackwithoutmask4_16(in,out); + + case 5: + return __fastpackwithoutmask5_16(in,out); + + case 6: + return __fastpackwithoutmask6_16(in,out); + + case 7: + return __fastpackwithoutmask7_16(in,out); + + case 8: + return 
__fastpackwithoutmask8_16(in,out); + + case 9: + return __fastpackwithoutmask9_16(in,out); + + case 10: + return __fastpackwithoutmask10_16(in,out); + + case 11: + return __fastpackwithoutmask11_16(in,out); + + case 12: + return __fastpackwithoutmask12_16(in,out); + + case 13: + return __fastpackwithoutmask13_16(in,out); + + case 14: + return __fastpackwithoutmask14_16(in,out); + + case 15: + return __fastpackwithoutmask15_16(in,out); + + case 16: + return __fastpackwithoutmask16_16(in,out); + + case 17: + return __fastpackwithoutmask17_16(in,out); + + case 18: + return __fastpackwithoutmask18_16(in,out); + + case 19: + return __fastpackwithoutmask19_16(in,out); + + case 20: + return __fastpackwithoutmask20_16(in,out); + + case 21: + return __fastpackwithoutmask21_16(in,out); + + case 22: + return __fastpackwithoutmask22_16(in,out); + + case 23: + return __fastpackwithoutmask23_16(in,out); + + case 24: + return __fastpackwithoutmask24_16(in,out); + + case 25: + return __fastpackwithoutmask25_16(in,out); + + case 26: + return __fastpackwithoutmask26_16(in,out); + + case 27: + return __fastpackwithoutmask27_16(in,out); + + case 28: + return __fastpackwithoutmask28_16(in,out); + + case 29: + return __fastpackwithoutmask29_16(in,out); + + case 30: + return __fastpackwithoutmask30_16(in,out); + + case 31: + return __fastpackwithoutmask31_16(in,out); + + case 32: + return __fastpackwithoutmask32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,24 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + 
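+
+ /* As noted above for the 16-value packers, the "withoutmask" variants assume
+    every input already fits in b bits: values are OR'd into place unmasked, so
+    a stray high bit in one input would corrupt its neighbours in the word. */
+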
*out |= ( (*in) ) << 14 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> 
( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + 
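+
+ /* Pack-side straddle: the value just OR'd in at bit offset 26 keeps only its
+    low 32 - 26 = 6 bits there; opening the fresh word with >> (9 - 3) = >> 6
+    deposits its remaining high 3 bits at bit 0 before the next value is OR'd
+    in at bit 3. */
+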
++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out 
= ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + 
++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + 
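+
+ /* Bits 16..31 of the current word now hold the low 16 bits of a 17-bit
+    value; its single remaining high bit opens the next word via
+    >> (17 - 1) = >> 16. */
+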
++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + + return out + 1; + } + + + + 
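+
+ /* The unrolled kernels above and below all specialize one loop. As an
+    illustrative sketch only -- the generic helpers below are hypothetical and
+    not part of the generated code -- this packs n b-bit values into 32-bit
+    words, assuming 1 <= b <= 32 (b == 0 is the nullpacker case), that each
+    input already fits in b bits, and that <stdint.h> is in scope as in the
+    rest of this file: */
+
+ static uint32_t * pack_generic(const uint32_t *in, uint32_t *out,
+                                unsigned n, unsigned b) {
+   unsigned p = 0;                       /* bit offset inside the word w */
+   uint32_t w = 0;                       /* word currently being assembled */
+   for (unsigned i = 0; i < n; i++) {
+     w |= in[i] << p;                    /* low bits of the value */
+     if (p + b >= 32) {                  /* word is full: flush it */
+       *out++ = w;
+       w = (p + b > 32) ? in[i] >> (32 - p) : 0; /* carry the spilled high bits */
+     }
+     p = (p + b) & 31;
+   }
+   if (p) *out++ = w;                    /* flush a partially filled last word */
+   return out;
+ }
+
+ /* The matching generic unpacker; `% (1U << b)` in the generated kernels is
+    the same mask written as a modulo: */
+
+ static const uint32_t * unpack_generic(const uint32_t *in, uint32_t *out,
+                                        unsigned n, unsigned b) {
+   unsigned p = 0;                       /* bit offset inside *in */
+   for (unsigned i = 0; i < n; i++) {
+     uint32_t v = *in >> p;              /* low bits from the current word */
+     if (p + b > 32) {                   /* high bits live in the next word */
+       ++in;
+       v |= *in << (32 - p);
+     } else if (p + b == 32) {
+       ++in;                             /* value ended exactly on the boundary */
+     }
+     out[i] = b < 32 ? v & ((1U << b) - 1) : v;
+     p = (p + b) & 31;
+   }
+   return p ? in + 1 : in;               /* mirror the generated return in / in + 1 */
+ }
+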
uint32_t * __fastpackwithoutmask20_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out 
|= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) 
) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) 
) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 
- 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( 
(*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = 
(*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + + + + +const uint32_t * __fastunpack1_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + *out = ( (*in) >> 16 ) & 1 ; + out++; + *out = ( (*in) >> 17 ) & 1 ; + out++; + *out = ( (*in) >> 18 ) & 1 ; + out++; + *out = ( (*in) >> 19 ) & 1 ; + out++; + *out = ( (*in) >> 20 ) & 1 ; + out++; + *out = ( (*in) >> 21 ) & 1 ; + out++; + *out = ( (*in) >> 22 ) & 1 ; + out++; + *out = ( (*in) >> 23 ) & 1 ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack2_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack3_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + 
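/* Editor's note (not part of the original patch): the statements that
   follow show the word-boundary stitch used throughout these unpackers.
   The 11th 3-bit value has its low 2 bits in bits 30..31 of the current
   word and its high bit in bit 0 of the next word, so the code reads
   `(*in) >> 30`, advances `in`, and ORs in `((*in) % (1U << 1)) << (3 - 1)`
   -- the leftover high bit, shifted up to position 2. */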
out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 3 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack4_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack5_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 5 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack6_24(const uint32_t * __restrict__ in, 
uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack7_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 7 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack8_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( 
(*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack9_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 9 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack10_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 
10 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack11_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack12_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack13_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 20 
) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 13 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack14_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack15_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % 
(1U << 15 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack16_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack17_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out 
|= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 17 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack18_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack19_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + 
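/* --- Editor's note (not part of the original patch) ------------------
   A generic scalar unpacker equivalent in effect to these generated
   __fastunpackB_24 kernels, useful as a mental model and as a test
   oracle for the unrolled code. Disabled with #if 0; the names are the
   editor's. */
#if 0
#include <stdint.h>
static const uint32_t *unpack24_ref(const uint32_t *in, uint32_t *out,
                                    unsigned b) {
    unsigned used = 0;                 /* bits consumed from *in so far */
    for (int i = 0; i < 24; ++i) {     /* assumes 1 <= b <= 32 */
        uint32_t v = *in >> used;
        if (used + b > 32) {           /* value straddles two words */
            ++in;
            v |= *in << (32 - used);   /* stitch in the high part */
            used = used + b - 32;
        } else if ((used += b) == 32) { ++in; used = 0; }
        out[i] = (b < 32) ? (v & ((1u << b) - 1)) : v;
    }
    return used ? in + 1 : in;         /* same convention as the kernels:
                                          in+1 if mid-word, else in */
}
#endif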
*out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack20_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + 
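/* Editor's note (not part of the original patch): throughout these
   unpackers the generator writes the mask as a remainder, `x % (1U << k)`,
   rather than `x & ((1U << k) - 1)`. For unsigned x and 0 < k < 32 the
   two are identical, and compilers lower the power-of-two modulus to the
   same single AND instruction. */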
*out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack22_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack23_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 
); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) 
% (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 
22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; 
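/* --- Editor's note (not part of the original patch) ------------------
   When the bit width divides the layout evenly the generator emits a
   repeating body: 24 values * 28 bits = 672 bits = exactly 21 words, and
   the layout repeats every lcm(28,32)/28 = 8 values (7 words), which is
   why __fastunpack28_24 is the same 8-value block three times and ends
   word-aligned (`return in;` rather than `return in + 1;`). One common
   way to select kernels like these is a table indexed by the bit width
   (a switch over b is equally common); a sketch of that dispatch, with
   the editor's names and trimmed to three widths, is below, disabled
   with #if 0. */
#if 0
typedef const uint32_t *(*unpack24_fn)(const uint32_t *, uint32_t *);
static const unpack24_fn unpack24_tab[33] = {
    [1] = __fastunpack1_24,
    [2] = __fastunpack2_24,
    [3] = __fastunpack3_24,
    /* ...the real file defines one kernel per width up to 32... */
};
/* usage: in = unpack24_tab[b](in, out);  -- caller ensures the entry
   for width b is populated */
#endif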
+ ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 
8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + 
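+   /* At bit width 32 nothing can be packed away: every group in this
+      function is a plain word copy, so the body as a whole is equivalent
+      to memcpy(out, in, 24 * sizeof(uint32_t)) followed by return in + 24. */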
+  out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+  *out = ( (*in) >> 0 ) ; ++in; out++;
+
+  return in;
+}
+
+const uint32_t * fastunpack_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) {
+  switch(bit) {
+  case 0: return nullunpacker24(in,out);
+  case 1: return __fastunpack1_24(in,out);
+  case 2: return __fastunpack2_24(in,out);
+  case 3: return __fastunpack3_24(in,out);
+  case 4: return __fastunpack4_24(in,out);
+  case 5: return __fastunpack5_24(in,out);
+  case 6: return __fastunpack6_24(in,out);
+  case 7: return __fastunpack7_24(in,out);
+  case 8: return __fastunpack8_24(in,out);
+  case 9: return __fastunpack9_24(in,out);
+  case 10: return __fastunpack10_24(in,out);
+  case 11: return __fastunpack11_24(in,out);
+  case 12: return __fastunpack12_24(in,out);
+  case 13: return __fastunpack13_24(in,out);
+  case 14: return __fastunpack14_24(in,out);
+  case 15: return __fastunpack15_24(in,out);
+  case 16: return __fastunpack16_24(in,out);
+  case 17: return __fastunpack17_24(in,out);
+  case 18: return __fastunpack18_24(in,out);
+  case 19: return __fastunpack19_24(in,out);
+  case 20: return __fastunpack20_24(in,out);
+  case 21: return __fastunpack21_24(in,out);
+  case 22: return __fastunpack22_24(in,out);
+  case 23: return __fastunpack23_24(in,out);
+  case 24: return __fastunpack24_24(in,out);
+  case 25: return __fastunpack25_24(in,out);
+  case 26: return __fastunpack26_24(in,out);
+  case 27: return __fastunpack27_24(in,out);
+  case 28: return __fastunpack28_24(in,out);
+  case 29: return __fastunpack29_24(in,out);
+  case 30: return __fastunpack30_24(in,out);
+  case 31: return __fastunpack31_24(in,out);
+  case 32: return __fastunpack32_24(in,out);
+  default: break;
+  }
+  /* bit widths above 32 are unsupported (the generator's C++ original
+     threw logic_error("number of bits is unsupported") here); return the
+     input unchanged rather than fall off the end of a non-void function */
+  return in;
+}
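+/* Round-trip sketch (an editor's illustration, not generator output): the
+   unpack dispatcher above and the pack dispatcher below are meant to be
+   used as a pair. The helper name bitpack24_roundtrip_demo is
+   hypothetical, and fastpackwithoutmask_24 is forward-declared here only
+   because its definition appears further down in this file. */
+uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+static void bitpack24_roundtrip_demo(void) {
+  uint32_t src[24], packed[24], dst[24];
+  int i;
+  for (i = 0; i < 24; ++i)
+    src[i] = (uint32_t)(i * 7) & 31;       /* every value fits in 5 bits */
+  fastpackwithoutmask_24(src, packed, 5);  /* 24 * 5 bits -> 4 words used */
+  fastunpack_24(packed, dst, 5);           /* dst again equals src */
+}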
+/*assumes that integers fit in the prescribed number of bits*/
+uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) {
+  switch(bit) {
+  case 0: return nullpacker(in,out);
+  case 1: return __fastpackwithoutmask1_24(in,out);
+  case 2: return __fastpackwithoutmask2_24(in,out);
+  case 3: return __fastpackwithoutmask3_24(in,out);
+  case 4: return __fastpackwithoutmask4_24(in,out);
+  case 5: return __fastpackwithoutmask5_24(in,out);
+  case 6: return __fastpackwithoutmask6_24(in,out);
+  case 7: return __fastpackwithoutmask7_24(in,out);
+  case 8: return __fastpackwithoutmask8_24(in,out);
+  case 9: return __fastpackwithoutmask9_24(in,out);
+  case 10: return __fastpackwithoutmask10_24(in,out);
+  case 11: return __fastpackwithoutmask11_24(in,out);
+  case 12: return __fastpackwithoutmask12_24(in,out);
+  case 13: return __fastpackwithoutmask13_24(in,out);
+  case 14: return __fastpackwithoutmask14_24(in,out);
+  case 15: return __fastpackwithoutmask15_24(in,out);
+  case 16: return __fastpackwithoutmask16_24(in,out);
+  case 17: return __fastpackwithoutmask17_24(in,out);
+  case 18: return __fastpackwithoutmask18_24(in,out);
+  case 19: return __fastpackwithoutmask19_24(in,out);
+  case 20: return __fastpackwithoutmask20_24(in,out);
+  case 21: return __fastpackwithoutmask21_24(in,out);
+  case 22: return __fastpackwithoutmask22_24(in,out);
+  case 23: return __fastpackwithoutmask23_24(in,out);
+  case 24: return __fastpackwithoutmask24_24(in,out);
+  case 25: return __fastpackwithoutmask25_24(in,out);
+  case 26: return __fastpackwithoutmask26_24(in,out);
+  case 27: return __fastpackwithoutmask27_24(in,out);
+  case 28: return __fastpackwithoutmask28_24(in,out);
+  case 29: return __fastpackwithoutmask29_24(in,out);
+  case 30: return __fastpackwithoutmask30_24(in,out);
+  case 31: return __fastpackwithoutmask31_24(in,out);
+  case 32: return __fastpackwithoutmask32_24(in,out);
+  default: break;
+  }
+  /* unsupported bit width (the generator's C++ original threw
+     logic_error("number of bits is unsupported") here); return the
+     output pointer unchanged rather than fall off the end */
+  return out;
+}
+
+const uint32_t * nullunpacker32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+  memset(out, 0, 32 * sizeof(uint32_t));
+  return in;
+}
+
+uint32_t * __fastpackwithoutmask1_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+  *out = (*in) ; ++in;
+  *out |= ( (*in) ) << 1 ; ++in;
+  *out |= ( (*in) ) << 2 ; ++in;
+  *out |= ( (*in) ) << 3 ; ++in;
+  *out |= ( (*in) ) << 4 ; ++in;
+  *out |= ( (*in) ) << 5 ; ++in;
+  *out |= ( (*in) ) << 6 ; ++in;
+  *out |= ( (*in) ) << 7 ; ++in;
+  *out |= ( (*in) ) << 8 ; ++in;
+  *out |= ( (*in) ) << 9 ; ++in;
+  *out |= ( (*in) ) << 10 ; ++in;
+  *out |= ( (*in) ) << 11 ; ++in;
+  *out |= ( (*in) ) << 12 ; ++in;
+  *out |= ( (*in) ) << 13 ; ++in;
+  *out |= ( (*in) ) << 14 ; ++in;
+  *out |= ( (*in) ) << 15 ; ++in;
+  *out |= ( (*in) ) << 16 ; ++in;
+  *out |= ( (*in) ) << 17 ; ++in;
+  *out |= ( (*in) ) << 18 ; ++in;
+  *out |= ( (*in) ) << 19 ; ++in;
+  *out |= ( (*in) ) << 20 ; ++in;
+  *out |= ( (*in) ) << 21 ; ++in;
+  *out |= ( (*in) ) << 22 ; ++in;
+  *out |= ( (*in) ) << 23 ; ++in;
+  *out |= ( (*in) ) << 24 ; ++in;
+  *out |= ( (*in) ) << 25 ; ++in;
+  *out |= ( (*in) ) << 26 ; ++in;
+  *out |= ( (*in) ) << 27 ; ++in;
+  *out |= ( (*in) ) << 28 ; ++in;
+  *out |= ( (*in) ) << 29 ; ++in;
+  *out |= ( (*in) ) << 30 ; ++in;
+  *out |= ( (*in) ) << 31 ; ++out; ++in;
+  return out;
+}
+
+uint32_t * __fastpackwithoutmask2_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+  *out = (*in) ; ++in;
+  *out |= ( (*in) ) << 2 ; ++in;
+  *out |= ( (*in) ) << 4 ; ++in;
+  *out |= ( (*in) ) << 6 ; ++in;
+  *out |= ( (*in) ) << 8 ; ++in;
+  *out |= ( (*in) ) << 10 ; ++in;
+  *out |= ( (*in) ) << 12 ; ++in;
+  *out |= ( (*in) ) << 14 ; ++in;
+  *out |= ( (*in) ) << 16 ; ++in;
+  *out |= ( (*in) ) << 18 ; ++in;
+  *out |= ( (*in) ) << 20 ; ++in;
+  *out |= ( (*in) ) << 22 ; ++in;
+  *out |= ( (*in) ) << 24 ; ++in;
+  *out |= ( (*in) ) << 26 ; ++in;
+  *out |= ( (*in) ) << 28 ; ++in;
+  *out |= ( (*in) ) << 30 ; ++out; ++in;
+  *out = (*in) ; ++in;
+  *out |= ( (*in) ) << 2 ; ++in;
+  *out |= ( (*in) ) << 4 ; ++in;
+  *out |= ( (*in) ) << 6 ; ++in;
+  *out |= ( (*in) ) << 8 ; ++in;
+  *out |= 
( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask4_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( 
(*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask6_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) 
) << 8 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask8_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 9 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask10_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 
18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 11 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 11 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask12_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + 
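+   /* Pack-side straddle: the 12-bit value ORed in just above sits at bit
+      offset 24, so only its low 8 bits fit in this word; after ++out the
+      statement "*out = ((*in)) >> (12 - 4);" deposits its remaining 4
+      high bits at the bottom of the next word. Because lcm(12, 32) = 96
+      bits, the layout repeats every 3 words / 8 values, which is why the
+      same 3-word group is unrolled four times in this function. */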
++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 13 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 13 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 13 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask14_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( 
(*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 15 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 15 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 15 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask16_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + 
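+   /* b = 16 divides 32, so each output word holds exactly two values and
+      nothing ever straddles a word boundary; the same holds for every
+      width dividing 32 (1, 2, 4, 8, 16), which is why those packers need
+      no ">> (b - r)" carry statements. */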
++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 17 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 17 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 17 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 17 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask18_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 
28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 19 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 19 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 19 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 19 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask20_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( 
(*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 21 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 21 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 21 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 21 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( 
(*in) ) >> ( 21 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask22_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out 
= ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 23 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 23 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 23 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 23 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 23 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask24_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 
; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask26_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out 
|= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 27 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask28_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( 
(*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 29 - 21 ); + ++in; + *out |= ( 
(*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask30_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask31_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); 
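+ /* Carry step of the generated packer: the value written just above straddles
+    a word boundary, so its low 5 bits went into the previous word via << 27
+    and its remaining 26 high bits land here via >> (31 - 26); the next value
+    then starts at bit 26 of the current word. */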
+ ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 31 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + *out = ( (*in) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + *out = ( (*in) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask32_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + +#if 1 +#define DST(__x) out[__x] +#define DSI +#else +#define DST(__x) *out++ +#define DSI +#endif + +const uint32_t * __fastunpack1_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) & 1 ; + DSI; + DST( 1) = ( (*in) >> 1 ) & 1 ; 
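+ /* DST/DSI (defined just above) toggle between two output styles: with the
+    active #if 1 branch, DST(i) is an indexed store out[i] and DSI is a no-op;
+    the #else branch would bump a pointer with *out++ instead. Each
+    __fastunpackB_32 extracts 32 B-bit fields by shifting and masking
+    (& 1 here for B == 1; wider widths use % (1U << B) as the mask); the last
+    field taken from a word needs no mask, since the shift already discards
+    everything else. */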
+ DSI; + DST( 2) = ( (*in) >> 2 ) & 1 ; + DSI; + DST( 3) = ( (*in) >> 3 ) & 1 ; + DSI; + DST( 4) = ( (*in) >> 4 ) & 1 ; + DSI; + DST( 5) = ( (*in) >> 5 ) & 1 ; + DSI; + DST( 6) = ( (*in) >> 6 ) & 1 ; + DSI; + DST( 7) = ( (*in) >> 7 ) & 1 ; + DSI; + DST( 8) = ( (*in) >> 8 ) & 1 ; + DSI; + DST( 9) = ( (*in) >> 9 ) & 1 ; + DSI; + DST(10) = ( (*in) >> 10 ) & 1 ; + DSI; + DST(11) = ( (*in) >> 11 ) & 1 ; + DSI; + DST(12) = ( (*in) >> 12 ) & 1 ; + DSI; + DST(13) = ( (*in) >> 13 ) & 1 ; + DSI; + DST(14) = ( (*in) >> 14 ) & 1 ; + DSI; + DST(15) = ( (*in) >> 15 ) & 1 ; + DSI; + DST(16) = ( (*in) >> 16 ) & 1 ; + DSI; + DST(17) = ( (*in) >> 17 ) & 1 ; + DSI; + DST(18) = ( (*in) >> 18 ) & 1 ; + DSI; + DST(19) = ( (*in) >> 19 ) & 1 ; + DSI; + DST(20) = ( (*in) >> 20 ) & 1 ; + DSI; + DST(21) = ( (*in) >> 21 ) & 1 ; + DSI; + DST(22) = ( (*in) >> 22 ) & 1 ; + DSI; + DST(23) = ( (*in) >> 23 ) & 1 ; + DSI; + DST(24) = ( (*in) >> 24 ) & 1 ; + DSI; + DST(25) = ( (*in) >> 25 ) & 1 ; + DSI; + DST(26) = ( (*in) >> 26 ) & 1 ; + DSI; + DST(27) = ( (*in) >> 27 ) & 1 ; + DSI; + DST(28) = ( (*in) >> 28 ) & 1 ; + DSI; + DST(29) = ( (*in) >> 29 ) & 1 ; + DSI; + DST(30) = ( (*in) >> 30 ) & 1 ; + DSI; + DST(31) = ( (*in) >> 31 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack2_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + DST( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST( 9) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(10) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(11) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(13) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(14) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(15) = ( (*in) >> 30 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST(17) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST(18) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + DST(19) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST(21) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST(22) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST(23) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST(25) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(26) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(27) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(29) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(30) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(31) = ( (*in) >> 30 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack3_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + DSI; + DST( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + DSI; + DST( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + DSI; + DST( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + DSI; + DST( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + DSI; + DST( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + DSI; + DST( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 3 ) ; + DSI; + DST( 9) = ( (*in) >> 27 ) % (1U << 
3 ) ; + DSI; + DST(10) = ( (*in) >> 30 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + DSI; + DST(11) = ( (*in) >> 1 ) % (1U << 3 ) ; + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 3 ) ; + DSI; + DST(13) = ( (*in) >> 7 ) % (1U << 3 ) ; + DSI; + DST(14) = ( (*in) >> 10 ) % (1U << 3 ) ; + DSI; + DST(15) = ( (*in) >> 13 ) % (1U << 3 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 3 ) ; + DSI; + DST(17) = ( (*in) >> 19 ) % (1U << 3 ) ; + DSI; + DST(18) = ( (*in) >> 22 ) % (1U << 3 ) ; + DSI; + DST(19) = ( (*in) >> 25 ) % (1U << 3 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) % (1U << 3 ) ; + DSI; + DST(21) = ( (*in) >> 31 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + DSI; + DST(22) = ( (*in) >> 2 ) % (1U << 3 ) ; + DSI; + DST(23) = ( (*in) >> 5 ) % (1U << 3 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 3 ) ; + DSI; + DST(25) = ( (*in) >> 11 ) % (1U << 3 ) ; + DSI; + DST(26) = ( (*in) >> 14 ) % (1U << 3 ) ; + DSI; + DST(27) = ( (*in) >> 17 ) % (1U << 3 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 3 ) ; + DSI; + DST(29) = ( (*in) >> 23 ) % (1U << 3 ) ; + DSI; + DST(30) = ( (*in) >> 26 ) % (1U << 3 ) ; + DSI; + DST(31) = ( (*in) >> 29 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack4_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST( 7) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 9) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(10) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(11) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(13) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(14) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(15) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(17) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(18) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(19) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(21) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(22) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(23) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(25) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(26) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(27) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(29) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(30) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(31) = ( (*in) >> 28 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack5_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + DSI; + DST( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + DSI; + DST( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + DSI; + DST( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + DSI; + DST( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + DSI; + DST( 6) = ( (*in) >> 30 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + DSI; + DST( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 5 ) ; + DSI; + DST( 9) = ( (*in) >> 13 ) % (1U << 5 ) ; + DSI; + DST(10) 
= ( (*in) >> 18 ) % (1U << 5 ) ; + DSI; + DST(11) = ( (*in) >> 23 ) % (1U << 5 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + DSI; + DST(13) = ( (*in) >> 1 ) % (1U << 5 ) ; + DSI; + DST(14) = ( (*in) >> 6 ) % (1U << 5 ) ; + DSI; + DST(15) = ( (*in) >> 11 ) % (1U << 5 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 5 ) ; + DSI; + DST(17) = ( (*in) >> 21 ) % (1U << 5 ) ; + DSI; + DST(18) = ( (*in) >> 26 ) % (1U << 5 ) ; + DSI; + DST(19) = ( (*in) >> 31 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 5 ) ; + DSI; + DST(21) = ( (*in) >> 9 ) % (1U << 5 ) ; + DSI; + DST(22) = ( (*in) >> 14 ) % (1U << 5 ) ; + DSI; + DST(23) = ( (*in) >> 19 ) % (1U << 5 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) % (1U << 5 ) ; + DSI; + DST(25) = ( (*in) >> 29 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); + DSI; + DST(26) = ( (*in) >> 2 ) % (1U << 5 ) ; + DSI; + DST(27) = ( (*in) >> 7 ) % (1U << 5 ) ; + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 5 ) ; + DSI; + DST(29) = ( (*in) >> 17 ) % (1U << 5 ) ; + DSI; + DST(30) = ( (*in) >> 22 ) % (1U << 5 ) ; + DSI; + DST(31) = ( (*in) >> 27 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack6_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + DSI; + DST( 5) = ( (*in) >> 30 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST( 9) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(10) = ( (*in) >> 28 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(11) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(13) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(14) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(15) = ( (*in) >> 26 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST(17) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST(18) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST(19) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST(20) = ( (*in) >> 24 ) % (1U << 6 ) ; + DSI; + DST(21) = ( (*in) >> 30 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST(22) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST(23) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST(25) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(26) = ( (*in) >> 28 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(27) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(29) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(30) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(31) = ( (*in) >> 26 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack7_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + DSI; + DST( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + DSI; + DST( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + DSI; + DST( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + DSI; + DST( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + DSI; + DST( 6) = ( (*in) 
>> 10 ) % (1U << 7 ) ; + DSI; + DST( 7) = ( (*in) >> 17 ) % (1U << 7 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 7 ) ; + DSI; + DST( 9) = ( (*in) >> 31 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + DSI; + DST(10) = ( (*in) >> 6 ) % (1U << 7 ) ; + DSI; + DST(11) = ( (*in) >> 13 ) % (1U << 7 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) % (1U << 7 ) ; + DSI; + DST(13) = ( (*in) >> 27 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + DSI; + DST(14) = ( (*in) >> 2 ) % (1U << 7 ) ; + DSI; + DST(15) = ( (*in) >> 9 ) % (1U << 7 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 7 ) ; + DSI; + DST(17) = ( (*in) >> 23 ) % (1U << 7 ) ; + DSI; + DST(18) = ( (*in) >> 30 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + DSI; + DST(19) = ( (*in) >> 5 ) % (1U << 7 ) ; + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 7 ) ; + DSI; + DST(21) = ( (*in) >> 19 ) % (1U << 7 ) ; + DSI; + DST(22) = ( (*in) >> 26 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + DSI; + DST(23) = ( (*in) >> 1 ) % (1U << 7 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 7 ) ; + DSI; + DST(25) = ( (*in) >> 15 ) % (1U << 7 ) ; + DSI; + DST(26) = ( (*in) >> 22 ) % (1U << 7 ) ; + DSI; + DST(27) = ( (*in) >> 29 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 7 ) ; + DSI; + DST(29) = ( (*in) >> 11 ) % (1U << 7 ) ; + DSI; + DST(30) = ( (*in) >> 18 ) % (1U << 7 ) ; + DSI; + DST(31) = ( (*in) >> 25 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack8_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 3) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 7) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 9) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(10) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(11) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(13) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(14) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(15) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(17) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(18) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(19) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(21) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(22) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(23) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(25) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(26) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(27) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(29) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(30) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(31) = ( (*in) >> 24 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack9_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + DSI; + DST( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + DSI; + DST( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + DSI; + DST( 3) = ( (*in) >> 27 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + 
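+ /* Straddle handling in the unpackers: DST(3) above first took its low
+    5 bits from the top of the previous word (>> 27), then, after ++in,
+    OR-ed in its high 4 bits from the bottom of the next word via
+    ((*in) % (1U << 4)) << (9 - 4). */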
DSI; + DST( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + DSI; + DST( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + DSI; + DST( 7) = ( (*in) >> 31 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 9 ) ; + DSI; + DST( 9) = ( (*in) >> 17 ) % (1U << 9 ) ; + DSI; + DST(10) = ( (*in) >> 26 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + DSI; + DST(11) = ( (*in) >> 3 ) % (1U << 9 ) ; + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 9 ) ; + DSI; + DST(13) = ( (*in) >> 21 ) % (1U << 9 ) ; + DSI; + DST(14) = ( (*in) >> 30 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + DSI; + DST(15) = ( (*in) >> 7 ) % (1U << 9 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 9 ) ; + DSI; + DST(17) = ( (*in) >> 25 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + DSI; + DST(18) = ( (*in) >> 2 ) % (1U << 9 ) ; + DSI; + DST(19) = ( (*in) >> 11 ) % (1U << 9 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) % (1U << 9 ) ; + DSI; + DST(21) = ( (*in) >> 29 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + DSI; + DST(22) = ( (*in) >> 6 ) % (1U << 9 ) ; + DSI; + DST(23) = ( (*in) >> 15 ) % (1U << 9 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 1 ))<<( 9 - 1 ); + DSI; + DST(25) = ( (*in) >> 1 ) % (1U << 9 ) ; + DSI; + DST(26) = ( (*in) >> 10 ) % (1U << 9 ) ; + DSI; + DST(27) = ( (*in) >> 19 ) % (1U << 9 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 5 ))<<( 9 - 5 ); + DSI; + DST(29) = ( (*in) >> 5 ) % (1U << 9 ) ; + DSI; + DST(30) = ( (*in) >> 14 ) % (1U << 9 ) ; + DSI; + DST(31) = ( (*in) >> 23 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack10_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST( 3) = ( (*in) >> 30 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST( 6) = ( (*in) >> 28 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST( 9) = ( (*in) >> 26 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + DSI; + DST(10) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(11) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(13) = ( (*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(14) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(15) = ( (*in) >> 22 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST(17) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST(18) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST(19) = ( (*in) >> 30 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST(21) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST(22) = ( (*in) >> 28 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST(23) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST(25) = ( (*in) >> 26 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + DSI; + DST(26) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(27) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(29) = ( 
(*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(30) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(31) = ( (*in) >> 22 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack11_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 11 ) ; + DSI; + DST( 1) = ( (*in) >> 11 ) % (1U << 11 ) ; + DSI; + DST( 2) = ( (*in) >> 22 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + DSI; + DST( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 11 ) ; + DSI; + DST( 5) = ( (*in) >> 23 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + DSI; + DST( 6) = ( (*in) >> 2 ) % (1U << 11 ) ; + DSI; + DST( 7) = ( (*in) >> 13 ) % (1U << 11 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + DSI; + DST( 9) = ( (*in) >> 3 ) % (1U << 11 ) ; + DSI; + DST(10) = ( (*in) >> 14 ) % (1U << 11 ) ; + DSI; + DST(11) = ( (*in) >> 25 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 11 ) ; + DSI; + DST(13) = ( (*in) >> 15 ) % (1U << 11 ) ; + DSI; + DST(14) = ( (*in) >> 26 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + DSI; + DST(15) = ( (*in) >> 5 ) % (1U << 11 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 11 ) ; + DSI; + DST(17) = ( (*in) >> 27 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + DSI; + DST(18) = ( (*in) >> 6 ) % (1U << 11 ) ; + DSI; + DST(19) = ( (*in) >> 17 ) % (1U << 11 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + DSI; + DST(21) = ( (*in) >> 7 ) % (1U << 11 ) ; + DSI; + DST(22) = ( (*in) >> 18 ) % (1U << 11 ) ; + DSI; + DST(23) = ( (*in) >> 29 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 11 ) ; + DSI; + DST(25) = ( (*in) >> 19 ) % (1U << 11 ) ; + DSI; + DST(26) = ( (*in) >> 30 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 9 ))<<( 11 - 9 ); + DSI; + DST(27) = ( (*in) >> 9 ) % (1U << 11 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 11 ) ; + DSI; + DST(29) = ( (*in) >> 31 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 10 ))<<( 11 - 10 ); + DSI; + DST(30) = ( (*in) >> 10 ) % (1U << 11 ) ; + DSI; + DST(31) = ( (*in) >> 21 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack12_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST( 2) = ( (*in) >> 24 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST( 5) = ( (*in) >> 28 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST( 7) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 9) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(10) = ( (*in) >> 24 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(11) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(13) = ( (*in) >> 28 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(14) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(15) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(17) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(18) = ( (*in) >> 24 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + 
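+ /* For B = 12, gcd(12, 32) = 4, so the bit layout repeats every 8 values
+    (8 * 12 = 96 bits = 3 words); this function is four copies of the same
+    8-value pattern. */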
DSI; + DST(19) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(21) = ( (*in) >> 28 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(22) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(23) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(25) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(26) = ( (*in) >> 24 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(27) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(29) = ( (*in) >> 28 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(30) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(31) = ( (*in) >> 20 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack13_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + DSI; + DST( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + DSI; + DST( 2) = ( (*in) >> 26 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + DSI; + DST( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + DSI; + DST( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + DSI; + DST( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + DSI; + DST( 7) = ( (*in) >> 27 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 13 ) ; + DSI; + DST( 9) = ( (*in) >> 21 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + DSI; + DST(10) = ( (*in) >> 2 ) % (1U << 13 ) ; + DSI; + DST(11) = ( (*in) >> 15 ) % (1U << 13 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + DSI; + DST(13) = ( (*in) >> 9 ) % (1U << 13 ) ; + DSI; + DST(14) = ( (*in) >> 22 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + DSI; + DST(15) = ( (*in) >> 3 ) % (1U << 13 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 13 ) ; + DSI; + DST(17) = ( (*in) >> 29 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + DSI; + DST(18) = ( (*in) >> 10 ) % (1U << 13 ) ; + DSI; + DST(19) = ( (*in) >> 23 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 13 ) ; + DSI; + DST(21) = ( (*in) >> 17 ) % (1U << 13 ) ; + DSI; + DST(22) = ( (*in) >> 30 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + DSI; + DST(23) = ( (*in) >> 11 ) % (1U << 13 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 5 ))<<( 13 - 5 ); + DSI; + DST(25) = ( (*in) >> 5 ) % (1U << 13 ) ; + DSI; + DST(26) = ( (*in) >> 18 ) % (1U << 13 ) ; + DSI; + DST(27) = ( (*in) >> 31 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 12 ))<<( 13 - 12 ); + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 13 ) ; + DSI; + DST(29) = ( (*in) >> 25 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 6 ))<<( 13 - 6 ); + DSI; + DST(30) = ( (*in) >> 6 ) % (1U << 13 ) ; + DSI; + DST(31) = ( (*in) >> 19 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack14_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 14 ) ; + DSI; + DST( 1) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST( 2) = ( (*in) >> 28 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST( 3) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST( 5) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST( 6) = ( (*in) >> 20 ) ; + 
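+ /* For B = 14, 16 values fill exactly 7 words (16 * 14 = 224 bits), so
+    values 16..31 below repeat the shift pattern of values 0..15. DST(6) is
+    another straddling value: its top 2 bits are merged in from the next
+    word. */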
++in; + DST( 6) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST( 7) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST( 9) = ( (*in) >> 30 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(10) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(11) = ( (*in) >> 26 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(13) = ( (*in) >> 22 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(14) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(15) = ( (*in) >> 18 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 14 ) ; + DSI; + DST(17) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST(18) = ( (*in) >> 28 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST(19) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST(20) = ( (*in) >> 24 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST(21) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST(22) = ( (*in) >> 20 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST(23) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST(25) = ( (*in) >> 30 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(26) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(27) = ( (*in) >> 26 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(29) = ( (*in) >> 22 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(30) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(31) = ( (*in) >> 18 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack15_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 15 ) ; + DSI; + DST( 1) = ( (*in) >> 15 ) % (1U << 15 ) ; + DSI; + DST( 2) = ( (*in) >> 30 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + DSI; + DST( 3) = ( (*in) >> 13 ) % (1U << 15 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + DSI; + DST( 5) = ( (*in) >> 11 ) % (1U << 15 ) ; + DSI; + DST( 6) = ( (*in) >> 26 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + DSI; + DST( 7) = ( (*in) >> 9 ) % (1U << 15 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + DSI; + DST( 9) = ( (*in) >> 7 ) % (1U << 15 ) ; + DSI; + DST(10) = ( (*in) >> 22 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + DSI; + DST(11) = ( (*in) >> 5 ) % (1U << 15 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + DSI; + DST(13) = ( (*in) >> 3 ) % (1U << 15 ) ; + DSI; + DST(14) = ( (*in) >> 18 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + DSI; + DST(15) = ( (*in) >> 1 ) % (1U << 15 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 15 ) ; + DSI; + DST(17) = ( (*in) >> 31 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + DSI; + DST(18) = ( (*in) >> 14 ) % (1U << 15 ) ; + DSI; + DST(19) = ( (*in) >> 29 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 15 ) ; + DSI; + DST(21) = ( (*in) >> 27 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + DSI; + DST(22) = ( (*in) >> 10 ) % (1U << 15 ) ; + DSI; + DST(23) = ( (*in) >> 25 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 15 ) ; + DSI; + DST(25) = ( (*in) >> 23 
) ; + ++in; + DST(25) |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); + DSI; + DST(26) = ( (*in) >> 6 ) % (1U << 15 ) ; + DSI; + DST(27) = ( (*in) >> 21 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 15 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 15 ) ; + DSI; + DST(29) = ( (*in) >> 19 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 2 ))<<( 15 - 2 ); + DSI; + DST(30) = ( (*in) >> 2 ) % (1U << 15 ) ; + DSI; + DST(31) = ( (*in) >> 17 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack16_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 1) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 2) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 3) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 5) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 6) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 7) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 9) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(10) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(11) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(13) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(14) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(15) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(17) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(18) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(19) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(21) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(22) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(23) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(25) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(26) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(27) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(29) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(30) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(31) = ( (*in) >> 16 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack17_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 17 ) ; + DSI; + DST( 1) = ( (*in) >> 17 ) ; + ++in; + DST( 1) |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + DSI; + DST( 2) = ( (*in) >> 2 ) % (1U << 17 ) ; + DSI; + DST( 3) = ( (*in) >> 19 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 17 ) ; + DSI; + DST( 5) = ( (*in) >> 21 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + DSI; + DST( 6) = ( (*in) >> 6 ) % (1U << 17 ) ; + DSI; + DST( 7) = ( (*in) >> 23 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 17 ) ; + DSI; + DST( 9) = ( (*in) >> 25 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + DSI; + DST(10) = ( (*in) >> 10 ) % (1U << 17 ) ; + DSI; + DST(11) = ( (*in) >> 27 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 17 ) ; + DSI; + DST(13) = ( (*in) >> 29 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + DSI; + DST(14) = ( (*in) >> 14 ) % (1U << 17 ) ; + DSI; + DST(15) = ( (*in) >> 31 ) ; + ++in; + DST(15) |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + DSI; + DST(16) = ( (*in) >> 16 ) ; + ++in; + DST(16) |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + DSI; + DST(17) = ( (*in) >> 1 ) % (1U << 17 ) ; + DSI; + DST(18) = ( (*in) >> 18 ) ; + ++in; + DST(18) |= 
((*in) % (1U<< 3 ))<<( 17 - 3 ); + DSI; + DST(19) = ( (*in) >> 3 ) % (1U << 17 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + DSI; + DST(21) = ( (*in) >> 5 ) % (1U << 17 ) ; + DSI; + DST(22) = ( (*in) >> 22 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + DSI; + DST(23) = ( (*in) >> 7 ) % (1U << 17 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); + DSI; + DST(25) = ( (*in) >> 9 ) % (1U << 17 ) ; + DSI; + DST(26) = ( (*in) >> 26 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); + DSI; + DST(27) = ( (*in) >> 11 ) % (1U << 17 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); + DSI; + DST(29) = ( (*in) >> 13 ) % (1U << 17 ) ; + DSI; + DST(30) = ( (*in) >> 30 ) ; + ++in; + DST(30) |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); + DSI; + DST(31) = ( (*in) >> 15 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack18_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack19_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 
19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 19 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack20_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack22_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) 
% (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack23_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack24_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const 
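+ /* A hypothetical driver for these kernels (dispatch shape invented for
+    illustration): pick B as the maximum bit width of a 32-integer block,
+    encode with __fastpackwithoutmaskB_32 (inputs must already fit in B bits,
+    since no masking is applied on the pack side), and decode with the
+    matching __fastunpackB_32, e.g. via a switch or an array of function
+    pointers indexed by B. Each call consumes and produces exactly 32
+    integers and B 32-bit words. */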
uint32_t * __fastunpack25_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack26_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) 
% (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack27_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 
18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack28_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_32(const uint32_t * __restrict__ in, uint32_t * 
__restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack30_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 
- 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out 
|= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); + out++; + *out = ( (*in) >> 1 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack32_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker32(in,out); + + case 1: + return __fastunpack1_32(in,out); + + case 2: + return __fastunpack2_32(in,out); + + case 3: + return __fastunpack3_32(in,out); + + case 4: + return __fastunpack4_32(in,out); + + case 5: + return __fastunpack5_32(in,out); + + case 6: + return __fastunpack6_32(in,out); + + case 7: + return __fastunpack7_32(in,out); + + case 8: + return __fastunpack8_32(in,out); + + case 9: + return __fastunpack9_32(in,out); + + case 10: + return __fastunpack10_32(in,out); + + case 11: + return __fastunpack11_32(in,out); + + case 12: + return __fastunpack12_32(in,out); + + case 13: + return __fastunpack13_32(in,out); + + case 
14: + return __fastunpack14_32(in,out); + + case 15: + return __fastunpack15_32(in,out); + + case 16: + return __fastunpack16_32(in,out); + + case 17: + return __fastunpack17_32(in,out); + + case 18: + return __fastunpack18_32(in,out); + + case 19: + return __fastunpack19_32(in,out); + + case 20: + return __fastunpack20_32(in,out); + + case 21: + return __fastunpack21_32(in,out); + + case 22: + return __fastunpack22_32(in,out); + + case 23: + return __fastunpack23_32(in,out); + + case 24: + return __fastunpack24_32(in,out); + + case 25: + return __fastunpack25_32(in,out); + + case 26: + return __fastunpack26_32(in,out); + + case 27: + return __fastunpack27_32(in,out); + + case 28: + return __fastunpack28_32(in,out); + + case 29: + return __fastunpack29_32(in,out); + + case 30: + return __fastunpack30_32(in,out); + + case 31: + return __fastunpack31_32(in,out); + + case 32: + return __fastunpack32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_32(in,out); + + case 2: + return __fastpackwithoutmask2_32(in,out); + + case 3: + return __fastpackwithoutmask3_32(in,out); + + case 4: + return __fastpackwithoutmask4_32(in,out); + + case 5: + return __fastpackwithoutmask5_32(in,out); + + case 6: + return __fastpackwithoutmask6_32(in,out); + + case 7: + return __fastpackwithoutmask7_32(in,out); + + case 8: + return __fastpackwithoutmask8_32(in,out); + + case 9: + return __fastpackwithoutmask9_32(in,out); + + case 10: + return __fastpackwithoutmask10_32(in,out); + + case 11: + return __fastpackwithoutmask11_32(in,out); + + case 12: + return __fastpackwithoutmask12_32(in,out); + + case 13: + return __fastpackwithoutmask13_32(in,out); + + case 14: + return __fastpackwithoutmask14_32(in,out); + + case 15: + return __fastpackwithoutmask15_32(in,out); + + case 16: + return __fastpackwithoutmask16_32(in,out); + + case 17: + return __fastpackwithoutmask17_32(in,out); + + case 18: + return __fastpackwithoutmask18_32(in,out); + + case 19: + return __fastpackwithoutmask19_32(in,out); + + case 20: + return __fastpackwithoutmask20_32(in,out); + + case 21: + return __fastpackwithoutmask21_32(in,out); + + case 22: + return __fastpackwithoutmask22_32(in,out); + + case 23: + return __fastpackwithoutmask23_32(in,out); + + case 24: + return __fastpackwithoutmask24_32(in,out); + + case 25: + return __fastpackwithoutmask25_32(in,out); + + case 26: + return __fastpackwithoutmask26_32(in,out); + + case 27: + return __fastpackwithoutmask27_32(in,out); + + case 28: + return __fastpackwithoutmask28_32(in,out); + + case 29: + return __fastpackwithoutmask29_32(in,out); + + case 30: + return __fastpackwithoutmask30_32(in,out); + + case 31: + return __fastpackwithoutmask31_32(in,out); + + case 32: + return __fastpackwithoutmask32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + diff --git a/aux/simdcomp/bitpacka.h b/aux/simdcomp/bitpacka.h new file mode 100644 index 0000000..5efce66 --- /dev/null +++ b/aux/simdcomp/bitpacka.h @@ -0,0 +1,28 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+#ifndef BITPACKINGALIGNED
+#define BITPACKINGALIGNED
+#include <stdint.h>
+
+const uint32_t * fastunpack_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+const uint32_t * fastunpack_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+const uint32_t * fastunpack_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+const uint32_t * fastunpack_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit);
+
+
+
+#endif // BITPACKINGALIGNED
diff --git a/aux/simdcomp/bitpacka.o b/aux/simdcomp/bitpacka.o
new file mode 100644
index 0000000000000000000000000000000000000000..a778450b4bf09c5bc1175a91bc7e30d6c30847e5
GIT binary patch
literal 116792
[base85-encoded payload of the checked-in object file bitpacka.o omitted]
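The unrolled __fastunpackN_32 kernels above, together with the __fastpackwithoutmaskN_32 kernels dispatched beside them, are machine-generated specializations of one scheme: each value occupies "bit" consecutive bits in a stream of 32-bit words, and a value that straddles a word boundary is stitched together from the high bits of the current word and the low bits of the next. As a reading aid only — this function is not part of the patch, and the name genericunpack32 is invented — a generic, unoptimized equivalent looks like this:

#include <stdint.h>

/* Reference equivalent of the unrolled __fastunpack<bit>_32 kernels:
   decodes 32 values of width bit (1..31) from a packed word stream.
   The generated code exists precisely to replace these data-dependent
   shifts and branches with constant-folded straight-line code. */
static const uint32_t *genericunpack32(const uint32_t *in, uint32_t *out, uint32_t bit) {
    const uint32_t mask = (1U << bit) - 1;
    uint32_t used = 0;                 /* bits already consumed from *in */
    for (int k = 0; k < 32; k++) {
        uint32_t v = *in >> used;      /* low part from the current word */
        if (used + bit > 32) {         /* value straddles a word boundary */
            ++in;
            v |= *in << (32 - used);   /* high part from the next word */
        } else if (used + bit == 32) {
            ++in;                      /* boundary hit exactly */
        }
        out[k] = v & mask;
        used = (used + bit) & 31;
    }
    return in;
}

The widths 0 and 32 are handled by the dedicated nullunpacker32 and __fastunpack32_32 cases in the fastunpack_32 dispatcher, and the _8, _16 and _24 variants declared in bitpacka.h presumably cover shorter aligned runs of the same layout.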
diff --git a/aux/simdcomp/example.c b/aux/simdcomp/example.c
new file mode 100644
--- /dev/null
+++ b/aux/simdcomp/example.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "simdcomp.h"
+
+
+// compresses data from datain to buffer, returns how many bytes written
+size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) {
+    if(length/SIMDBlockSize*SIMDBlockSize != length) {
+        printf("Data length should be a multiple of %i \n",SIMDBlockSize);
+    }
+    uint32_t offset = 0;
+    uint8_t * initout = buffer;
+    for(size_t k = 0; k < length / SIMDBlockSize; ++k) {
+        uint32_t b = simdmaxbitsd1(offset,
+                datain + k * SIMDBlockSize);
+        *buffer++ = b;
+        simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer,
+                b);
+        offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1];
+        buffer += b * sizeof(__m128i);
+    }
+    return buffer - initout;
+}
+
+
+int main() {
+    int REPEAT = 5;
+    int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128
+    uint32_t * datain = malloc(N * sizeof(uint32_t));
+    size_t compsize;
+    clock_t start, end;
+
+    uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer
+    uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t));
+    for (int gap = 1; gap <= 243; gap *= 3) {
+        printf("\n");
+        printf(" gap = %d \n", gap);
+        for (int k = 0; k < N; ++k)
+            datain[k] = k * gap;
+        uint32_t offset = 0;
+        compsize = compress(datain,N,buffer);
+        printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 ));
+        start = clock();
+        uint32_t bogus = 0;
+        for(int repeat = 0; repeat < REPEAT; ++repeat) {
+            uint8_t * decbuffer = buffer;
+            for (int k = 0; k * SIMDBlockSize < N; ++k) {
+                uint8_t b = *decbuffer++;
+                simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b);
+                // do something here with backbuffer
+                bogus += backbuffer[3];
+                decbuffer += b * sizeof(__m128i);
+                offset = backbuffer[SIMDBlockSize - 1];
+            }
+        }
+        end = clock();
+        double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC;
+        printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0));
+        printf("ignore me %u \n",bogus);
+    }
+    free(buffer);
+    free(datain);
+    free(backbuffer);
+    return 0;
+}
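example.c wraps only the encoder in a helper; the matching decoder is inlined in main(). For reference, the symmetric helper would look like the sketch below. It is not part of the patch: the name uncompress is invented here, and it assumes exactly the layout compress() writes, namely a one-byte bit width in front of every 128-integer block.

// Counterpart of compress() above: decodes length integers (length must be
// a multiple of SIMDBlockSize) from buffer into dataout and returns the
// number of bytes consumed. Sketch only; it belongs next to compress()
// in example.c and uses the same simdcomp.h declarations.
size_t uncompress(const uint8_t * buffer, uint32_t * dataout, size_t length) {
    const uint8_t * initin = buffer;
    uint32_t offset = 0;
    for (size_t k = 0; k < length / SIMDBlockSize; ++k) {
        uint8_t b = *buffer++;    // per-block bit width written by compress()
        simdunpackd1(offset, (const __m128i *) buffer, dataout + k * SIMDBlockSize, b);
        offset = dataout[k * SIMDBlockSize + SIMDBlockSize - 1];
        buffer += b * sizeof(__m128i);
    }
    return buffer - initin;
}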
diff --git a/aux/simdcomp/include/simdbitpacking.h b/aux/simdcomp/include/simdbitpacking.h
new file mode 100644
index 0000000..301f4f5
--- /dev/null
+++ b/aux/simdcomp/include/simdbitpacking.h
@@ -0,0 +1,21 @@
+/**
+ * This code is released under a BSD License.
+ */
+#ifndef SIMDBITPACKING_H_
+#define SIMDBITPACKING_H_
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h>    // use a C99-compliant compiler, please
+#include <string.h>    // for memset
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+void simdpack(const uint32_t * in, __m128i * out, uint32_t bit);
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+void simdpackwithoutmask(const uint32_t * in, __m128i * out, uint32_t bit);
+
+//reads "bit" 128-bit vectors from "in", writes 128 values to "out"
+void simdunpack(const __m128i * in, uint32_t * out, uint32_t bit);
+
+
+#endif /* SIMDBITPACKING_H_ */
diff --git a/aux/simdcomp/include/simdcomp.h b/aux/simdcomp/include/simdcomp.h
new file mode 100644
index 0000000..8875f0f
--- /dev/null
+++ b/aux/simdcomp/include/simdcomp.h
@@ -0,0 +1,12 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMP_H_
+#define SIMDCOMP_H_
+
+#include "simdbitpacking.h"
+#include "simdcomputil.h"
+#include "simdintegratedbitpacking.h"
+
+#endif
diff --git a/aux/simdcomp/include/simdcomputil.h b/aux/simdcomp/include/simdcomputil.h
new file mode 100644
index 0000000..107665b
--- /dev/null
+++ b/aux/simdcomp/include/simdcomputil.h
@@ -0,0 +1,29 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMDCOMPUTIL_H_
+#define SIMDCOMPUTIL_H_
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h>    // use a C99-compliant compiler, please
+
+
+// returns the integer logarithm of v (bit width)
+uint32_t bits(const uint32_t v);
+
+// max integer logarithm over a range of SIMDBlockSize integers (128 integers)
+uint32_t maxbits(const uint32_t * begin);
+
+enum{ SIMDBlockSize = 128 };
+
+// like maxbits over 128 integers (SIMDBlockSize) with provided initial value
+// and using differential coding
+uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in);
+
+
+#endif /* SIMDCOMPUTIL_H_ */
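To pin down what these declarations compute, here is a scalar reference sketch (illustrative only, not part of the patch; the ref_* names are hypothetical), using the SIMDBlockSize of 128 defined above:

#include <stdint.h>

// smallest b with v < 2^b; this is what bits() returns
static uint32_t ref_bits(uint32_t v) {
    uint32_t b = 0;
    while (v) { ++b; v >>= 1; }
    return b;
}

// maxbits(): bit width of the widest of 128 values, via a running OR
static uint32_t ref_maxbits(const uint32_t * in) {
    uint32_t accumulator = 0;
    for (int k = 0; k < 128; ++k) accumulator |= in[k];
    return ref_bits(accumulator);
}

// simdmaxbitsd1(): same idea, but over the deltas in[k] - previous value,
// seeded with initvalue (the last value of the preceding block)
static uint32_t ref_maxbitsd1(uint32_t initvalue, const uint32_t * in) {
    uint32_t accumulator = 0, prev = initvalue;
    for (int k = 0; k < 128; ++k) { accumulator |= in[k] - prev; prev = in[k]; }
    return ref_bits(accumulator);
}

The d1 variant measures the deltas rather than the raw values: for a sorted block, successive differences are small, so the per-block bit width — and hence the packed size — shrinks accordingly.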
diff --git a/aux/simdcomp/include/simdintegratedbitpacking.h b/aux/simdcomp/include/simdintegratedbitpacking.h
new file mode 100644
index 0000000..18ca795
--- /dev/null
+++ b/aux/simdcomp/include/simdintegratedbitpacking.h
@@ -0,0 +1,27 @@
+/**
+ * This code is released under a BSD License.
+ */
+
+#ifndef SIMD_INTEGRATED_BITPACKING_H
+#define SIMD_INTEGRATED_BITPACKING_H
+
+#include <emmintrin.h> // SSE2 is required
+#include <stdint.h>    // use a C99-compliant compiler, please
+
+#include "simdcomputil.h"
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+// integer values should be in sorted order (for best results)
+void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, uint32_t bit);
+
+
+//reads 128 values from "in", writes "bit" 128-bit vectors to "out"
+// integer values should be in sorted order (for best results)
+void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, uint32_t bit);
+
+
+//reads "bit" 128-bit vectors from "in", writes 128 values to "out"
+void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, uint32_t bit);
+
+
+#endif
diff --git a/aux/simdcomp/makefile b/aux/simdcomp/makefile
new file mode 100644
index 0000000..6ebd9d9
--- /dev/null
+++ b/aux/simdcomp/makefile
@@ -0,0 +1,54 @@
+# minimalist makefile
+.SUFFIXES:
+#
+.SUFFIXES: .cpp .o .c .h
+
+CFLAGS = -fPIC -std=c99 -O3 -Wall -Wextra -Wno-unused-parameter -pedantic
+LDFLAGS = -shared
+LIBNAME=libsimdcomp.so.0.0.3
+all: unit $(LIBNAME)
+test:
+	./unit
+install: $(OBJECTS)
+	cp $(LIBNAME) /usr/local/lib
+	ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so
+	ldconfig
+	cp $(HEADERS) /usr/local/include
+
+
+
+HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h
+
+uninstall:
+	for h in $(HEADERS) ; do rm /usr/local/$$h; done
+	rm /usr/local/lib/$(LIBNAME)
+	rm /usr/local/lib/libsimdcomp.so
+	ldconfig
+
+
+OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o
+
+$(LIBNAME): $(OBJECTS)
+	$(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS)
+
+
+
+simdcomputil.o: ./src/simdcomputil.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude
+
+simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude
+
+simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS)
+	$(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude
+
+example: ./example.c $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS)
+
+unit: ./src/unit.c $(HEADERS) $(OBJECTS)
+	$(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS)
+dynunit: ./src/unit.c $(HEADERS) $(LIBNAME)
+	$(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp
+
+clean:
+	rm -f unit *.o $(LIBNAME)
diff --git a/aux/simdcomp/src/simdbitpacking.c b/aux/simdcomp/src/simdbitpacking.c
new file mode 100644
index 0000000..556a845
--- /dev/null
+++ b/aux/simdcomp/src/simdbitpacking.c
@@ -0,0 +1,14008 @@
+/**
+ * This code is released under a BSD License.
+ */ +#include "../include/simdbitpacking.h" + + +static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { + memset(out,0,32 * 4 * 4); +} + +static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg 
= _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask11_32(const uint32_t * 
_in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + 
++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
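+ /* Note (editor): for even widths the bit-offset schedule is periodic.  With
+  * B = 18, gcd(18, 32) = 2, so the offsets repeat after 32/2 = 16 inputs and
+  * 18/2 = 9 outputs; the generator therefore emits the same 16-load/9-store
+  * block twice, restarting at offset 0 with "OutReg = InReg". */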
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = 
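+ /* Note (editor): B = 20 packs with gcd(20, 32) = 4, giving a period of
+  * 32/4 = 8 inputs and 20/4 = 5 outputs; the body below is that
+  * 8-load/5-store block unrolled four times.  Packed size for 128 values:
+  * 20 * 16 = 320 bytes instead of 512. */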
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + 
__m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); 
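+ /* Note (editor): odd widths such as B = 21 have gcd(B, 32) = 1, so the
+  * offset walk visits every residue mod 32 exactly once and never repeats:
+  * a single straight-line schedule of 32 loads and 21 stores, where every
+  * output word after the first is seeded from the spilled high bits
+  * (_mm_srli_epi32(InReg, 21 - k)) of the value that straddled into it. */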
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); 
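+ /* Editor's note on the "withoutmask" contract (an assumption, based on the
+  * pairing of these routines with masked variants in this library): callers
+  * must guarantee every input already fits in B bits -- nothing is masked
+  * here, so a stray high bit would corrupt the neighbouring packed value.
+  * A masked flavour would clamp each load first, e.g. for B = 23:
+  *
+  *     const __m128i mask = _mm_set1_epi32((1U << 23) - 1);
+  *     InReg = _mm_and_si128(_mm_loadu_si128(++in), mask);
+  */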
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, 
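+ /* Note (editor): B = 24 is byte-aligned: gcd(24, 32) = 8 gives a period of
+  * only 4 inputs and 3 outputs, unrolled eight times, and every shift count
+  * (24, 16, 8) is a multiple of 8, so values sit on byte boundaries within
+  * each word. */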
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = 
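+ /* Note (editor): output size is linear in B -- each routine writes exactly
+  * B __m128i, i.e. 16 * B bytes per 128 integers.  For B = 26 that is 416
+  * bytes versus 512 uncompressed, so the savings shrink quickly at high
+  * widths; at B = 32 packing is effectively a copy and a caller might skip
+  * these kernels entirely. */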
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + 
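+ /* Note (editor): all loads and stores use the unaligned _mm_loadu_si128 /
+  * _mm_storeu_si128 forms, so neither buffer needs 16-byte alignment; when a
+  * caller can guarantee alignment, the aligned _mm_load_si128 /
+  * _mm_store_si128 intrinsics (also SSE2) are the usual drop-in
+  * replacement. */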
+ OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
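+ /* Note (editor): B = 30 leaves only 2 free bits per 32-bit word, so each
+  * output word is almost one whole input plus a 2-bit sliver carried over;
+  * with gcd(30, 32) = 2 the 16-load/15-store schedule below repeats twice. */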
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static 
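+/* The bit widths that divide 32 evenly come next (4, 8 and 16): no value
+   ever crosses a 32-bit boundary there, so the generator emits a compact
+   loop instead of fully unrolled straight-line code. */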
void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_loadu_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 
- 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + 
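+    /* Unlike the "withoutmask" family above, the __SIMD_fastpackN_32
+       variants AND every input lane with the (1U<<N)-1 mask first, so
+       values wider than N bits cannot corrupt their neighbours. */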
OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
+ + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<23)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + 
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; 
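+ /* [editorial annotation, not part of the original commit] The second half of
+    __SIMD_fastpack26_32 repeats the first verbatim: after 16 inputs the 26-bit
+    layout realigns to a 32-bit lane boundary (16*26 = 13*32 = 416 bits), so the
+    generator restarts with OutReg = InReg and emits an identical sequence. */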
+ OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
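+ /* [editorial annotation, not part of the original commit] Carry idiom used
+    throughout these packers: when a b-bit value straddles a 32-bit lane,
+    _mm_srli_epi32(InReg, b - s) drops the (b - s) low bits already stored in
+    the previous output word and leaves the s spilled high bits at the bottom
+    of the new word; the next input is then OR-ed in at bit position s. */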
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 
10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + 
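+ /* [editorial annotation, not part of the original commit] With b = 31 every
+    value after the first straddles a lane boundary, so each output word
+    carries one more spilled bit than the last and the shift amounts simply
+    count down 31, 30, ..., 1 across the 32 output words. */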
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { + 
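+ /* [editorial annotation, not part of the original commit] For bit widths
+    that divide 32 (4, 8, 16) no value ever straddles a lane boundary, so the
+    generator emits a short loop instead of fully unrolled code: each
+    iteration below ORs eight 4-bit values into one 128-bit output word
+    (32 bits per lane) and advances the input by eight vectors. */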
const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + unsigned i; + for (i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} + + + + +static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + 
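+ /* [editorial annotation, not part of the original commit] Unpacking uses
+    the inverse carry idiom, visible just above: _mm_srli_epi32(InReg, p)
+    recovers the low bits of a value starting at bit p of the previous word,
+    and after loading the next word _mm_slli_epi32(InReg, 5 - s) (s = bits
+    spilled into the new word, so 5 - s = bits already recovered) shifts the
+    continuation above them; the 5-bit mask discards everything higher. */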
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); 
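+ /* editor note: 8 divides 32, so no value straddles a lane: each input word
+    yields four outputs (shifts 0, 8, 16, 24) and the final srli of 24 needs
+    no mask. */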
+ + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); 
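+ /* editor note: with 10-bit packing, 16 outputs consume exactly five input
+    words (160 bits), so the splice schedule below runs twice per block of 32. */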
+ + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
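/* editor sketch, not part of the original patch: one scalar lane equivalent
+   to these generated kernels, assuming b-bit vertical packing with b < 32;
+   the SIMD code runs four such lanes at once, one per 32-bit SSE lane:
+     for (k = 0; k < 32; k++) {
+       unsigned bit = k * b, w = bit >> 5, s = bit & 31;
+       uint32_t v = in[w] >> s;
+       if (s + b > 32) v |= in[w + 1] << (32 - s);
+       out[k] = v & ((1u << b) - 1);
+     } */ +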
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
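/* editor note: 14-bit values re-align every 16 outputs (16*14 = 7 words), so
+   this first half of the schedule is repeated verbatim for the second half. */ +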
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_storeu_si128(out++, OutReg); + + 
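/* editor note: 15 is odd, so the stream only re-aligns after all 32 outputs
+   (15 input words); every word crossing in between takes the srli/slli splice. */ +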
OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = 
_mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; 
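+ /* editor note: with b = 17 a word never holds two complete values, so 16 of
+    the 32 outputs take the two-word splice across the 17 input words. */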
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + 
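/* editor note: the generated shift constant is written b-r, where r is the
+   count of high bits supplied by the next word; 19-14 above means 5 low bits
+   came from the previous word and 14 high bits from the current one. */ +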
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
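/* second copy of the 16-value pattern: gcd(22,32)=2, so the bit layout repeats after 16 outputs (11 packed words) and the body is simply that sequence unrolled twice */ +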
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + 
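/* 23 is coprime with 32, so the layout never repeats inside the block: all 23 packed words are consumed in one fully unrolled sequence of 32 outputs */ +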
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); 
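+ /* for 24-bit values the layout repeats every 4 outputs (3 packed words); when the running shift reaches a 32-bit boundary (8+24=32) the OR step is skipped and the next word is preloaded before the store */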
+ + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + 
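/* the same straddle step in scalar form, as an illustrative sketch only (w0, w1 and sh are hypothetical names for two consecutive packed words and the value's bit offset in w0): out = (w0 >> sh) | ((w1 << (32 - sh)) & ((1U << 25) - 1)) */ +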
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + 
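/* last output of the block: the shift by 7 leaves exactly 32-7=25 bits in the register, so no mask is needed before the final store */ +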
_mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_storeu_si128(out++, OutReg); + + +} + + +void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + uint32_t outer; + for(outer=0; outer< 32 ;++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } +} + + + +void 
simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { + switch(bit) { + case 0: SIMD_nullunpacker32(in,out); return; + + case 1: __SIMD_fastunpack1_32(in,out); return; + + case 2: __SIMD_fastunpack2_32(in,out); return; + + case 3: __SIMD_fastunpack3_32(in,out); return; + + case 4: __SIMD_fastunpack4_32(in,out); return; + + case 5: __SIMD_fastunpack5_32(in,out); return; + + case 6: __SIMD_fastunpack6_32(in,out); return; + + case 7: __SIMD_fastunpack7_32(in,out); return; + + case 8: __SIMD_fastunpack8_32(in,out); return; + + case 9: __SIMD_fastunpack9_32(in,out); return; + + case 10: __SIMD_fastunpack10_32(in,out); return; + + case 11: __SIMD_fastunpack11_32(in,out); return; + + case 12: __SIMD_fastunpack12_32(in,out); return; + + case 13: __SIMD_fastunpack13_32(in,out); return; + + case 14: __SIMD_fastunpack14_32(in,out); return; + + case 15: __SIMD_fastunpack15_32(in,out); return; + + case 16: __SIMD_fastunpack16_32(in,out); return; + + case 17: __SIMD_fastunpack17_32(in,out); return; + + case 18: __SIMD_fastunpack18_32(in,out); return; + + case 19: __SIMD_fastunpack19_32(in,out); return; + + case 20: __SIMD_fastunpack20_32(in,out); return; + + case 21: __SIMD_fastunpack21_32(in,out); return; + + case 22: __SIMD_fastunpack22_32(in,out); return; + + case 23: __SIMD_fastunpack23_32(in,out); return; + + case 24: __SIMD_fastunpack24_32(in,out); return; + + case 25: __SIMD_fastunpack25_32(in,out); return; + + case 26: __SIMD_fastunpack26_32(in,out); return; + + case 27: __SIMD_fastunpack27_32(in,out); return; + + case 28: __SIMD_fastunpack28_32(in,out); return; + + case 29: __SIMD_fastunpack29_32(in,out); return; + + case 30: __SIMD_fastunpack30_32(in,out); return; + + case 31: __SIMD_fastunpack31_32(in,out); return; + + case 32: __SIMD_fastunpack32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; + + case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; + + case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; + + case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; + + case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; + + case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; + + case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; + + case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; + + case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; + + case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; + + case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; + + case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; + + case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; + + case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; + + case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; + + case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; + + case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; + + case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; + + case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; + + case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; + + case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; + + case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; + + case 23: __SIMD_fastpackwithoutmask23_32(in,out); return; + + case 24: __SIMD_fastpackwithoutmask24_32(in,out); return; + + case 25: __SIMD_fastpackwithoutmask25_32(in,out); 
return;
+
+    case 26: __SIMD_fastpackwithoutmask26_32(in,out); return;
+
+    case 27: __SIMD_fastpackwithoutmask27_32(in,out); return;
+
+    case 28: __SIMD_fastpackwithoutmask28_32(in,out); return;
+
+    case 29: __SIMD_fastpackwithoutmask29_32(in,out); return;
+
+    case 30: __SIMD_fastpackwithoutmask30_32(in,out); return;
+
+    case 31: __SIMD_fastpackwithoutmask31_32(in,out); return;
+
+    case 32: __SIMD_fastpackwithoutmask32_32(in,out); return;
+
+    default: break;
+  }
+}
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) {
+  switch(bit) {
+    case 0: return;
+
+    case 1: __SIMD_fastpack1_32(in,out); return;
+
+    case 2: __SIMD_fastpack2_32(in,out); return;
+
+    case 3: __SIMD_fastpack3_32(in,out); return;
+
+    case 4: __SIMD_fastpack4_32(in,out); return;
+
+    case 5: __SIMD_fastpack5_32(in,out); return;
+
+    case 6: __SIMD_fastpack6_32(in,out); return;
+
+    case 7: __SIMD_fastpack7_32(in,out); return;
+
+    case 8: __SIMD_fastpack8_32(in,out); return;
+
+    case 9: __SIMD_fastpack9_32(in,out); return;
+
+    case 10: __SIMD_fastpack10_32(in,out); return;
+
+    case 11: __SIMD_fastpack11_32(in,out); return;
+
+    case 12: __SIMD_fastpack12_32(in,out); return;
+
+    case 13: __SIMD_fastpack13_32(in,out); return;
+
+    case 14: __SIMD_fastpack14_32(in,out); return;
+
+    case 15: __SIMD_fastpack15_32(in,out); return;
+
+    case 16: __SIMD_fastpack16_32(in,out); return;
+
+    case 17: __SIMD_fastpack17_32(in,out); return;
+
+    case 18: __SIMD_fastpack18_32(in,out); return;
+
+    case 19: __SIMD_fastpack19_32(in,out); return;
+
+    case 20: __SIMD_fastpack20_32(in,out); return;
+
+    case 21: __SIMD_fastpack21_32(in,out); return;
+
+    case 22: __SIMD_fastpack22_32(in,out); return;
+
+    case 23: __SIMD_fastpack23_32(in,out); return;
+
+    case 24: __SIMD_fastpack24_32(in,out); return;
+
+    case 25: __SIMD_fastpack25_32(in,out); return;
+
+    case 26: __SIMD_fastpack26_32(in,out); return;
+
+    case 27: __SIMD_fastpack27_32(in,out); return;
+
+    case 28: __SIMD_fastpack28_32(in,out); return;
+
+    case 29: __SIMD_fastpack29_32(in,out); return;
+
+    case 30: __SIMD_fastpack30_32(in,out); return;
+
+    case 31: __SIMD_fastpack31_32(in,out); return;
+
+    case 32: __SIMD_fastpack32_32(in,out); return;
+
+    default: break;
+  }
+}
+
diff --git a/aux/simdcomp/src/simdbitpacking.o b/aux/simdcomp/src/simdbitpacking.o
new file mode 100644
index 0000000000000000000000000000000000000000..b582a09c255c964330b7e35df36c786591bf7be4
GIT binary patch
literal 74408
[... 74408 bytes of base85-encoded data for the checked-in object file omitted ...]
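(Editor's aside - illustrative only, not part of the patch.) The __SIMD_fastpackN_32 / __SIMD_fastunpackN_32 kernels above are machine-generated: each one hard-codes the shift-and-mask schedule for a single bit width N, handling four integers per SSE instruction, with simdpack/simdunpack acting as dispatch tables. A scalar reference for the unpack side might look like the sketch below; the function name and the fixed 128-integer block size are the editor's, chosen to mirror the library's SIMDBlockSize convention.

#include <stdint.h>

/* Unpack 128 b-bit integers (1 <= b <= 32) stored back to back in 32-bit
 * words; a value may straddle a word boundary, in which case its high part
 * is spliced in from the next word. */
static void scalar_unpack(const uint32_t *in, uint32_t *out, uint32_t b) {
    const uint32_t mask = (b == 32) ? 0xffffffffu : ((1u << b) - 1);
    uint32_t used = 0;                /* bits already consumed from *in */
    int k;
    for (k = 0; k < 128; k++) {
        uint32_t v = *in >> used;     /* low bits of the next value */
        if (used + b > 32) {          /* value straddles a word boundary */
            ++in;
            v |= *in << (32 - used);  /* splice in the high bits */
            used = used + b - 32;
        } else {
            used += b;
            if (used == 32) { used = 0; ++in; }
        }
        out[k] = v & mask;            /* trim neighboring values' bits */
    }
}

The SIMD variants perform this same arithmetic on four lanes at once, with every per-iteration branch unrolled away, which is why each bit width needs its own generated function.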
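(Editor's aside - illustrative only.) A typical round trip through the dispatchers above, picking the width with maxbits() from simdcomputil.c, which the patch adds just below. The names roundtrip, datain, recovered and packed are hypothetical; if the umbrella header simdcomp.h does not re-export the needed declarations, include simdbitpacking.h and simdcomputil.h directly.

#include <stdint.h>
#include <emmintrin.h>
#include "simdcomp.h"                     /* umbrella header added by this patch */

void roundtrip(const uint32_t datain[128], uint32_t recovered[128]) {
    __m128i packed[32];                   /* worst case b = 32 -> 32 x 128-bit words */
    const uint32_t b = maxbits(datain);   /* width of the widest value in the block */
    simdpack(datain, packed, b);          /* 128 values -> b 128-bit output words */
    simdunpack(packed, recovered, b);     /* decode them back */
}

A 128-integer block packed at b bits thus occupies exactly 16*b bytes; b itself has to be stored out of band.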
za7*YTXt5R~*yn>A01t~kgL-vfEwIE3S}Y9K0&bRm1TDpYXn_yNf^r$?4h2v@60&9r zvJm+8d~j-psGbilX&)AS2CXmvTL4bC(DVlCOKaI$?^^cge_4b}oqt5y}o$#;ca)& z+Vc;fWXj391E_4nPeAuohT40IduMYXO&R;0l)&ffmpfd)*TEL0nC+Kivs1{ga0OfkH7Fc2c zZGr=9ffQJvjTsOv@Fi)WW)rBOU^@(|Mn~~z2#kgRo)BPXU|>MT_6!UR$he+?fdLu+ zXJ%kP!Tbyi|B>;3c832bSf1hke`IXW@c%zDu4nlF9~u8={Qn;XGynhp9~ra%|NkEu z^Z)<k)u15rJdnh$K-Fm@iM@b|A;-rbsMr%Ew+KMlY)E=lpkk~@Vir)b|4*c_%Q03AaUli<1|}r) zVP$|JOdJ}PuyP;(CJwa%Ru<$##TD2fR>0C_K$QXm0|Tt=*a#J$pa?MsR(>3V ziWjIr#9?K~ZJ0PT1Hj4=&|W=|e+#rA>S1LGD-*=O6LcZsu<}F-Dt-W3pu@@(kpDpD zd@zQnhm|WXQ1uSbJO(RUBBA01RuJ{D_|1okFR+1#!^)U8sQ3g>AmXrc2NX^qe_im0h{MYsW=QyK2!@Em z${%&8_=IqXIIIi;@t7H)i5bK}!pscN#Ec{cVlp#86Elc|gqaz*k$4cM9k{H+G{*%h z4l3IqDj}p7R9peVg_8kLacK0y#lxWDpz9amf(#4{F;H<(K1LQ#f{G^~3oJ$$6$f>%L3|jV2Ng#u`oL04pyHs;Hdp{ktb&Te+6ho$h7C}0P}dtK z2BNn?#XB3l6$kabVPYWq3RE1_ zwgd5D_!d+g)SHKif#?TNaael=BnHCIpyHsxA`l;j-$2Dd0|_uO5d8@%4jOR)@sTkz z11B;U%>Kd5z#zoH$S@%q%w>QUa}4|}5b*<0aadh11r`4=38EfW-|Ioe4NgPE;dOp- zW^PJSW=TO}a&~53x?VCve0;E{uS-q;9_v=M%3WBk&l z_@&M8OPk}DHY6Z#M8I)I1YBo=#d>T;V)F$yrx_YxG0+fO02^9jQE7xNri`#@$Levc zzBR(;b|Y+#H?lx;QfVG2b)km`MAQ&HWTC?721A9h8*PLh{t(p|{(uUj`vfYC9+D7Y zWAvzi3WI}K&&S!&fB{4rfk_iEX$B@OK%@~^)(9+X1eP@d%Nl`Yjlr_UU|D0Ztg$&r zm$4;?G%*B|#$XaGZUPoJ1&f=3)tiB3&A=v@LBzpknnF~9Z7~JwH3i#iY6>#V)C4&j zLp=me9-z_y!e;<6LGgxE0Kml<7)+q;LQu08)P4B#9|A(4?Lv^a4g&)NOuPl!E(D2# znqDyRE6{c!NL&v|y$Q5k2ol#v5}yKX7lOn=bvewO5@@>+Bo1nm!^NQmCP>@}$(#~s zyAUJ}YTCinPXRd&+=gagFhNp(1Y{Xh+!RUt3rI6m+zd(F1==nInGa&Z%&&m93qj(@ z;kE?YE(D2#x@a) zB#!K_E6{c!NF3xexcSg_AxPX7$^0qMb|FaI4oUnJw0;7$?4VHru7G4>z!U>WJ#zT0 ziARVdhi6SPLL3z2F!#JkLx>~CuT2(0963Go0g4 zgg7V(z{D%s5#pev4HLi7i4aH5rxHB~aW^FQB!D`o&~)yOBwhe*z=GTfN@6hctDx-< zkhmw3`hQSySbE?9b?6xw80?_!5|}t_+(qRq!ad08YRLtJIC47Vxq=W!PPbF8BgB#O zMb0gRIC6Thxr-1-&R;ChhBL@t$ocWgBZPY7bi(r(A{_dG?2Bd4E`7YK3W^z-Ev zLL51r)VxKABd3Qa?-An2>A?ruzz4Y>IeuF{A=D$sugqtJICA`|d_{;O$1g+_GBymd z;{i0jVLdodbc4hTpyIIp8%#U{nx3H52)Nq;Ql9`7hxO}V>LZ}(QUhu+Ok4n(jzM`D zBn1<{08RIxxIh*!faViW+#!oMK=T#I@5tf|(0mB%AHvLUfaY7+crQ%+1~i|;`i(I0 z2xz&G0Wusk?hMUi6QJb?v{-_N!wG1)1C@r0AApKOt%r+WfR){lj$FMx`lKvSOq z6^9lBaQ7!b#c!agUjeOGpxGC$UIJPVL9-oPTmUK#%`R|p4yZV6yd36E2B@4Uz__ zXMl=ZfCP}#|2PE+Fj&77Bn?vk0V)m~&xeUGfQrNVw=i)JXhcPT1dz=?3NatnPXUOfTTg@ORR&4?*IuP zsn=Ky5r>UyfTTg{6`J;2Q={lsQ3>waSmv^6E@WZ zGk*cJf`ZM%z{C}x>0AIj9*Af!c|gl&SUU$~4G2SW3#3m9VuQp$7~FwkU|@i?Pe5WI z%mAHtf>lc(F%X8$J3*^+uy&9*Y~HB_WH_`90uqPKJ0WLpkT`7KX$`u1=)4oG9tWuf zVc5JAESNxIAPk##f+kIIM(2*c)` zj-Z(jn|Feh=ODEp44Zd)gQgxf?*tA=q;P=EJBffaBZ7aD%hX>zy`6MjLMu=Wsy2U+g}V#DGQlpH}e!TOybIS_7u7G&T> zYZJ9S{eFI36 ifq?-Qf1v1vIuR-Zp=Pi{Y%&KW4u}w#gouKfU;+T|!`in1 literal 0 HcmV?d00001 diff --git a/aux/simdcomp/src/simdcomputil.c b/aux/simdcomp/src/simdcomputil.c new file mode 100644 index 0000000..9b36da5 --- /dev/null +++ b/aux/simdcomp/src/simdcomputil.c @@ -0,0 +1,56 @@ +#include "../include/simdcomputil.h" + +__attribute__((always_inline)) +static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +} + + +// returns the integer logarithm of v (bit width) +uint32_t bits(const uint32_t v) { +#ifdef _MSC_VER + if (v == 0) { + return 0; + } + unsigned long answer; + _BitScanReverse(&answer, v); + return answer + 1; +#else + return v == 0 ? 
0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft +#endif +} + +__attribute__ ((pure)) +uint32_t maxbits(const uint32_t * begin) { + uint32_t accumulator = 0;const uint32_t * k; + for (k = begin; k != begin + SIMDBlockSize; ++k) { + accumulator |= *k; + } + return bits(accumulator); +} + +static uint32_t maxbitas32int(const __m128i accumulator) { + uint32_t tmparray[4]; + _mm_storeu_si128((__m128i *) (tmparray), accumulator); + return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); +} + + +// maxbit over 128 integers (SIMDBlockSize) with provided initial value +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { + __m128i initoffset = _mm_set1_epi32 (initvalue); + const __m128i* pin = (const __m128i*)(in); + __m128i newvec = _mm_loadu_si128(pin); + __m128i accumulator = Delta(newvec , initoffset); + __m128i oldvec = newvec; + uint32_t k; + for(k = 1; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); +} + diff --git a/aux/simdcomp/src/simdcomputil.o b/aux/simdcomp/src/simdcomputil.o new file mode 100644 index 0000000000000000000000000000000000000000..6957fafaed1a277a6f7ea868e9bd5fc899786e39 GIT binary patch literal 2416 zcmb<-^>JfjWMqH=Mg}_u1P><4z+k|JU^{@B4h*~uTnvT>TK|`D@$db=L81A(yylO? zX<(qoFW&-I<*&W{w_)k?=&yW%qd$vp@OU&I z5IFj=_!rC3kFO6LefjIa(TA^Z9Q~Pp$iwnO>CK}*1Y=YLx?NO6xLs5PxLs5vxLs6a zN_hGAUf7|)z`y_wHIHr|6^#@RV;->MN?ARck60Xj0nP^Q&dyc}8lg$0c_pO^CVG~7 zhPsAkrC_d^CPVX-Q^|UNS>c zW=SzaZej(P2FXF$DTWO3@!*h;PtM4WPfN_q0UM(P4JwdzJPZsBfBr)N%v~UH5C$t| zU|=wTGC*P=3{&TUCJu8yRG4fEWIiYjpoc7^PGMkBgSsDP0Vup-Y!EHYzyOYQn0k;H z2w#9kxde#Az`y`X?I3$#F$)t{V_*Qs9-1`_Y3Kq_b3kT5nP5tXfq?;JCc1KB^&3L% zNB2Lm`a%8%xe1oDYC%jG?FqFXB!-N;(LDr}gz1BF8RDS&L1_gh z2BIfG^(TNhNEj4WNL+?U9P#%88h@~K2Z}$CUJwT9hs7T#zd@}63B&ORs9*+M1j0af zKZuX+c96;lXi;JTQoz8#0LnX@(6|Q4!|Vr%L9GG_!|?>D`)%MN5C+sZ2#0}z0RS!v Bp0WS{ literal 0 HcmV?d00001 diff --git a/aux/simdcomp/src/simdintegratedbitpacking.c b/aux/simdcomp/src/simdintegratedbitpacking.c new file mode 100644 index 0000000..82e5d19 --- /dev/null +++ b/aux/simdcomp/src/simdintegratedbitpacking.c @@ -0,0 +1,24863 @@ +/** + * This code is released under a BSD License. 
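+ *
+ * (Editor's note, not part of the original header.) The ipackN/iunpackN
+ * routines in this file fuse differential coding with bit packing. Delta()
+ * computes, per 32-bit lane, x[i] - x[i-1]: _mm_slli_si128(curr, 4) shifts
+ * each value into the next lane, and _mm_srli_si128(prev, 12) supplies the
+ * element that precedes the whole vector. PrefixSum() inverts this with two
+ * shifted adds plus a broadcast of the running offset, so decoding needs no
+ * separate prefix-sum pass. The ipackN variants mask each delta to N bits;
+ * the ipackwithoutmaskN variants assume the deltas already fit.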
+ */ +#include "../include/simdintegratedbitpacking.h" + +__attribute__((always_inline)) +static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +} + +__attribute__((always_inline)) +static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); + return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); +} + + +__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + const __m128i zero = _mm_set1_epi32 (0); + unsigned i; + for (i = 0; i < 8; ++i) { + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + } + + return initOffset; +} + + + + +void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) { + +} + + +void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { +} + + + +void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(3U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(7U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + 
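/* the carried high bit of the straddling 3-bit delta sits at bit 0; deltas continue at bits 4, 7, ..., 28, 31 of word 1 */ +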
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(15U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(31U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + 
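/* word 1 opens with the 4 carried bits of the straddling 6-bit delta; the next deltas land at bits 4, 10, 16, 22, 28 */ +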
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(63U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(127U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
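/* bits 28..31 take the low 4 bits of this 7-bit delta; its high 3 bits carry into word 1 below */ +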
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
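/* this 7-bit delta's low 6 bits fill bits 26..31 of word 4; the high bit spills into the next output word */ +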
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(255U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
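+    /* "withoutmask" contract (implied by the generated code): the caller
+       guarantees every delta fits in 9 bits, since this variant skips the
+       _mm_and_si128 masking used by ipack9 below; an oversized delta would
+       bleed into the neighbouring packed fields */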
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(511U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1023U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
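+    /* 16 deltas * 10 bits = 160 bits = five full 32-bit lanes, so the bit
+       offset is back to zero at this point and the second half of the
+       128-integer block repeats the same shift/store schedule as the first */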
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2047U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
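+    /* Delta(CurrIn, initOffset) presumably yields, per 32-bit lane, the
+       difference of each integer from its predecessor in the stream
+       (initOffset carries the previous vector so the chain spans the whole
+       128-integer block); the 11-bit mask then truncates each difference
+       before it is ORed into OutReg */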
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
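+    /* at 12 bits per delta the layout repeats every 8 values
+       (8 * 12 = 96 bits = three full 32-bit lanes), so this function body
+       is four copies of the same three-store stanza */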
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4095U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
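/* masked 13-bit packer follows: every delta is ANDed with 0x1FFF (8191) so oversized values cannot spill into neighbouring fields */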
ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8191U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
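/* "withoutmask" 14-bit packer follows: the AND is omitted, saving one instruction per value; the caller must guarantee each delta fits in 14 bits */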
ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + 
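/* second half of the block: 16 deltas x 14 bits = 224 bits = exactly 7 words, so the same shift pattern repeats from scratch */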
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16383U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); 
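/* the delta inserted at bit 28 only fit its 4 low bits in this word; srli(InReg, 14 - 10) below carries its remaining 10 high bits into the next word */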
+ + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn 
= _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(32767U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
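/* carry idiom: srli(InReg, b - s) drops the bits already written and leaves the s spilled high bits at the bottom of the new word (s = 11 here) */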
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
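/* b = 16 divides 32, so this packer needs no carries at all: each output word holds exactly two 16-bit deltas */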
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(65535U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
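/* Delta(), defined earlier in the file, yields each value's difference from its predecessor; initOffset tracks the previous four inputs through the unrolled body */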
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
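/* 17 is coprime to 32, so the shift sequence (17, 2, 19, 4, ...) never realigns early: all 32 deltas are consumed filling 17 output words */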
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(131071U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(262143U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
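+ /* Carry step for a straddling value: the 19-bit delta just packed was shifted left by 19, so only its low 13 bits fit into bits 19..31 of the word stored above; shifting right by 19 - 6 = 13 brings its remaining 6 high bits down into bits 0..5 to seed the next output word. */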
OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(524287U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
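+ /* 524287U is 2^19 - 1: masking each Delta() result to its low 19 bits keeps stray high bits out of the OR-accumulated words; the ipackwithoutmask19 twin above omits the AND and relies on the caller guaranteeing every delta already fits in 19 bits. */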
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + 
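+ /* Since gcd(20, 32) = 4, the bit layout repeats every 8 inputs (8 x 20 = 160 bits = 5 output words), so this generated body is the same 8-load/5-store block unrolled four times to cover all 32 input vectors. */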
++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1048575U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + 
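+ /* Running delta chain: each step loads the next 4 source integers, takes Delta(CurrIn, initOffset) (defined earlier in this file; it appears to yield the lane-wise differences from the preceding values), then advances initOffset to CurrIn so the next iteration differences against the values just consumed. */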
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = 
InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; 
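+ /* With an odd width, gcd(21, 32) = 1, so the layout never repeats inside the block: all 32 input vectors are packed in one fully unrolled sequence emitting 21 output words, with a srli carry at almost every word boundary. */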
+ CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2097151U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + 
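+ /* Contract shared by every ipackN in this file: 32 loads of 4 integers consume 128 consecutive values and emit exactly N 128-bit words (here 21), i.e. a bit-exact N bits per integer; callers are expected to select the function from the maximum delta bit width. */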
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4194303U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + 
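+ /* 22-bit fields: 16 deltas fill exactly 11 output words
+    (16 * 22 = 352 = 11 * 32), so the OR/store/carry sequence runs
+    twice per call to cover all 32 input vectors (128 integers). */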
++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8388607U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16777215U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + 
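+ /* ipackwithoutmask25 omits the _mm_and_si128 masking step entirely:
+    it assumes every delta already fits in 25 bits, saving one
+    instruction per input vector relative to the masked ipack25. */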
++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + 
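+ /* initOffset always trails one vector behind the load: each CurrIn
+    becomes the next subtraction base, so the matching unpack routine
+    can rebuild the absolute values with a running prefix sum. */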
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(33554431U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(67108863U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + 
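/* descriptive note: the previous 26-bit delta was packed at bit 30, so only 2 of its bits fit in the last word; _mm_srli_epi32(InReg, 26 - 24) keeps its remaining 24 high bits in bits 0..23 of the fresh word, and the next delta is OR'd in at bit 24 */ +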
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = 
CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(134217727U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
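/* descriptive note: initOffset keeps the raw, unmasked input vector, so the next Delta() is computed against the original values rather than the 27-bit-truncated deltas */ +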
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(268435455U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
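/* descriptive note: 28-bit case: eight deltas fill exactly seven 32-bit words (8 * 28 = 224 = 7 * 32), so the OR at bit 4 below ends flush at bit 31 and the next group restarts with a fresh OutReg */ +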
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg 
= _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + 
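/* descriptive note: only bit 31 of the previous word held this 29-bit delta's low bit; _mm_srli_epi32(InReg, 29 - 28) recovers its remaining 28 bits, and the next delta is packed at bit 28 */ +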
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + 
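/* descriptive note: _mm_loadu_si128 is an unaligned SSE2 load, so the 128-value input block needs no 16-byte alignment */ +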
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(536870911U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
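+    /* Unlike ipackwithoutmask29 above, ipack29 first ANDs every delta with
+       mask = (1U << 29) - 1 (536870911U), so a value wider than 29 bits
+       cannot bleed into its neighbours' bit positions. */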
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1073741823U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
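+    /* 30-bit case: 16 deltas fill exactly 15 output words (16*30 = 480 bits),
+       so the 128-integer block is two identical 15-word halves; OutReg is
+       restarted from a fresh InReg at the midpoint. */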
OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + 
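+    /* Differential coding: Delta(CurrIn, initOffset) subtracts the previous
+       input vector lane-wise (Delta and PrefixSum appear to be defined earlier
+       in this file), and initOffset = CurrIn keeps the raw values so each
+       difference is taken against the original data, not against deltas. */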
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + 
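+    /* Each pack/unpack routine in this file is a fully unrolled loop for one
+       fixed bit width; the bodies differ only in their shift constants, which
+       is characteristic of machine-generated code. */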
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; 
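+    /* 31-bit case: after the first value, every delta spills across a word
+       boundary, so 128 integers occupy 124 32-bit words (31 SSE vectors). */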
+ CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
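+    /* The "withoutmask" packers trust the caller that every delta already
+       fits in the target width; only the masked ipackN variants strip excess
+       high bits before merging words. */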
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2147483647U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; 
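+    /* mask = 2147483647U is (1U << 31) - 1: only the sign bit of each 32-bit
+       delta is discarded before packing. */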
+ CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i 
InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = 
_mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + + +__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
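+    /* Unpacking inverts the scheme above: each 1-bit field is isolated by
+       shift-and-mask, PrefixSum (the inverse of Delta) re-accumulates the
+       deltas into absolute values, and the running initOffset is returned so
+       the next 128-integer block can continue the sum. */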
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, 
mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack2(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack3(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<3)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + 
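+    /* Because 3 does not divide 32, some fields straddle a word boundary: the
+       low bits come from _mm_srli_epi32 on the old word and the high bits from
+       _mm_slli_epi32(InReg, 3-1) or (3-2) on the next word, masked back to
+       3 bits before the prefix sum. */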
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
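/* 4-bit fields never straddle a 32-bit word (32/4 = 8 fields per word), so a fresh word is loaded after every 8 outputs */ +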
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg 
= PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + + + + tmp = InReg; + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); 
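+ /* the value just stored straddled a word boundary: its low 8 bits sat in the old word's top bits and the high 2 bits come from the new word, shifted into place by 10-2 */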
+ + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg 
= tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + 
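/* 12-bit layout: each group of 8 outputs consumes exactly three words (8*12 = 96 bits), so the extraction schedule repeats four times over the 128-value block */ +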
__m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + 
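/* bits 24..31 hold only the low 8 bits of the next field; the remaining 4 bits arrive from the following word via the 12-4 shift */ +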
tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
13-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
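+ /* Note: 16 is the degenerate width that divides 32 evenly, so every
+    32-bit word holds exactly two values, no value ever straddles a word
+    boundary, and the _mm_slli_epi32/_mm_or_si128 stitching used by the
+    other widths disappears; each step in iunpack16 is just a shift, a
+    mask and the prefix-sum integration. */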
+ _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); 
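+ /* Cross-word stitch: the value being assembled at this point started at
+    bit 25 of the previous word (the srli by 25 kept its low 7 bits); its
+    high 10 bits sit in bits 0..9 of the freshly loaded word, so they are
+    shifted left by 17-10 = 7 into place, OR-ed in, and the result is
+    masked back to 17 bits. */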
+ OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
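+ /* For an odd width such as 17, gcd(17,32) = 1, so the start position
+    (i*17) mod 32 takes a different phase for each of the 32 values of a
+    lane; the schedule in this kernel therefore never repeats within a
+    block, unlike the even widths that follow. */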
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
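+ /* General rule behind the shift constants written literally here
+    (18-16, 18-2, ...): value i of a lane occupies bits [i*b, (i+1)*b) of
+    that lane's bit stream. With p = (i*b) mod 32, a value that spills
+    s = p + b - 32 > 0 bits into the next word is rebuilt as
+    (old >> p) | ((new << (b-s)) & mask), which is exactly the srli/slli/or
+    sequence emitted for every straddling value. */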
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack19(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), 
mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; 
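+ /* Width 20 illustrates the periodic case: lcm(20,32) = 160 bits = five
+    words = eight values, so the eight-step schedule (shift phases 0, 20,
+    8, 28, 16, 4, 24, 12) simply repeats four times to cover the 32 values
+    of a lane, as the visible repetition in this kernel shows. */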
+ ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = 
_mm_set1_epi32((1U<<22)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
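+ /* unaligned store of 4 decoded values; _mm_storeu_si128 does not require _out to be 16-byte aligned */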
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + 
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
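+ /* initOffset now carries the 4 running totals forward, so the deltas keep accumulating across groups */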
+ _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
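+ /* no cross-word merge was needed above: after >>6 the remaining 26 bits form a complete value, and the next value sits in the low 26 bits of the freshly loaded word */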
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); 
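+ /* PrefixSum (defined earlier in this file) turns the 4 extracted deltas into absolute values via a 4-lane inclusive scan seeded with the carried offset */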
+ initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); 
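+ /* this 29-bit value straddles a word boundary: 6 low bits from the previous word (>>26), 23 high bits from the next word (<<6), masked to 29 bits */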
+ + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + 
++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + 
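+ /* the remaining unrolled steps repeat the same generated pattern: shift out
+    the bits already consumed from the current 32-bit word, OR in the low bits
+    of the next word whenever a 30-bit value straddles a word boundary, mask
+    to 30 bits, then prefix-sum and store */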
tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + 
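+ /* this 31-bit value straddles a word boundary: its low 30 bits came from the
+    word just consumed (shifted down by 2), and the OR below merges in the top
+    bit from the freshly loaded word */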
OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +__m128i iunpack32(__m128i initOffset, const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)(_out); + __m128i invec; + size_t k; + for(k = 0; k < 128/4; ++k) { + invec = _mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} + + + + + void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: iunpack0(initOffset,in,out); break; + + case 1: iunpack1(initOffset,in,out); break; + + case 2: iunpack2(initOffset,in,out); break; + + case 3: iunpack3(initOffset,in,out); break; + + case 4: iunpack4(initOffset,in,out); break; + + case 5: iunpack5(initOffset,in,out); break; + + case 6: iunpack6(initOffset,in,out); break; + + case 7: iunpack7(initOffset,in,out); break; + + case 8: iunpack8(initOffset,in,out); break; + + case 9: iunpack9(initOffset,in,out); break; + + case 10: iunpack10(initOffset,in,out); break; + + case 11: iunpack11(initOffset,in,out); break; + + case 12: iunpack12(initOffset,in,out); break; + + case 13: iunpack13(initOffset,in,out); break; + + case 14: iunpack14(initOffset,in,out); break; + + case 15: iunpack15(initOffset,in,out); break; + + case 16: iunpack16(initOffset,in,out); break; + + case 17: iunpack17(initOffset,in,out); break; + + case 18: iunpack18(initOffset,in,out); break; + + case 19: iunpack19(initOffset,in,out); break; + + case 20: iunpack20(initOffset,in,out); break; + + case 21: iunpack21(initOffset,in,out); break; + + case 22: iunpack22(initOffset,in,out); break; + + case 23: iunpack23(initOffset,in,out); break; + + case 24: iunpack24(initOffset,in,out); break; + + case 25: iunpack25(initOffset,in,out); break; + + case 26: iunpack26(initOffset,in,out); break; + + case 27: iunpack27(initOffset,in,out); break; + + case 28: iunpack28(initOffset,in,out); break; + + case 29: iunpack29(initOffset,in,out); break; + + case 30: iunpack30(initOffset,in,out); break; + + case 31: iunpack31(initOffset,in,out); break; + + case 32: iunpack32(initOffset,in,out); break; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + +void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: break; + + case 1: ipackwithoutmask1(initOffset,in,out); break; + + case 2: ipackwithoutmask2(initOffset,in,out); break; + + case 3: ipackwithoutmask3(initOffset,in,out); break; + + case 4: ipackwithoutmask4(initOffset,in,out); break; + + case 5: ipackwithoutmask5(initOffset,in,out); break; + + case 6: ipackwithoutmask6(initOffset,in,out); break; + + case 7: ipackwithoutmask7(initOffset,in,out); break; + + case 8: ipackwithoutmask8(initOffset,in,out); break; + + case 9: ipackwithoutmask9(initOffset,in,out); break; + + case 10: ipackwithoutmask10(initOffset,in,out); break; + + case 11: ipackwithoutmask11(initOffset,in,out); break; + + case 12: ipackwithoutmask12(initOffset,in,out); break; + + case 13: ipackwithoutmask13(initOffset,in,out); break; + + case 14: ipackwithoutmask14(initOffset,in,out); break; + + case 15: 
ipackwithoutmask15(initOffset,in,out); break;
+
+ case 16: ipackwithoutmask16(initOffset,in,out); break;
+
+ case 17: ipackwithoutmask17(initOffset,in,out); break;
+
+ case 18: ipackwithoutmask18(initOffset,in,out); break;
+
+ case 19: ipackwithoutmask19(initOffset,in,out); break;
+
+ case 20: ipackwithoutmask20(initOffset,in,out); break;
+
+ case 21: ipackwithoutmask21(initOffset,in,out); break;
+
+ case 22: ipackwithoutmask22(initOffset,in,out); break;
+
+ case 23: ipackwithoutmask23(initOffset,in,out); break;
+
+ case 24: ipackwithoutmask24(initOffset,in,out); break;
+
+ case 25: ipackwithoutmask25(initOffset,in,out); break;
+
+ case 26: ipackwithoutmask26(initOffset,in,out); break;
+
+ case 27: ipackwithoutmask27(initOffset,in,out); break;
+
+ case 28: ipackwithoutmask28(initOffset,in,out); break;
+
+ case 29: ipackwithoutmask29(initOffset,in,out); break;
+
+ case 30: ipackwithoutmask30(initOffset,in,out); break;
+
+ case 31: ipackwithoutmask31(initOffset,in,out); break;
+
+ case 32: ipackwithoutmask32(initOffset,in,out); break;
+
+ default: break;
+ }
+}
+
+
+
+
+void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) {
+ __m128i initOffset = _mm_set1_epi32 (initvalue);
+ switch(bit) {
+ case 0: break;
+
+ case 1: ipack1(initOffset, in,out); break;
+
+ case 2: ipack2(initOffset, in,out); break;
+
+ case 3: ipack3(initOffset, in,out); break;
+
+ case 4: ipack4(initOffset, in,out); break;
+
+ case 5: ipack5(initOffset, in,out); break;
+
+ case 6: ipack6(initOffset, in,out); break;
+
+ case 7: ipack7(initOffset, in,out); break;
+
+ case 8: ipack8(initOffset, in,out); break;
+
+ case 9: ipack9(initOffset, in,out); break;
+
+ case 10: ipack10(initOffset, in,out); break;
+
+ case 11: ipack11(initOffset, in,out); break;
+
+ case 12: ipack12(initOffset, in,out); break;
+
+ case 13: ipack13(initOffset, in,out); break;
+
+ case 14: ipack14(initOffset, in,out); break;
+
+ case 15: ipack15(initOffset, in,out); break;
+
+ case 16: ipack16(initOffset, in,out); break;
+
+ case 17: ipack17(initOffset, in,out); break;
+
+ case 18: ipack18(initOffset, in,out); break;
+
+ case 19: ipack19(initOffset, in,out); break;
+
+ case 20: ipack20(initOffset, in,out); break;
+
+ case 21: ipack21(initOffset, in,out); break;
+
+ case 22: ipack22(initOffset, in,out); break;
+
+ case 23: ipack23(initOffset, in,out); break;
+
+ case 24: ipack24(initOffset, in,out); break;
+
+ case 25: ipack25(initOffset, in,out); break;
+
+ case 26: ipack26(initOffset, in,out); break;
+
+ case 27: ipack27(initOffset, in,out); break;
+
+ case 28: ipack28(initOffset, in,out); break;
+
+ case 29: ipack29(initOffset, in,out); break;
+
+ case 30: ipack30(initOffset, in,out); break;
+
+ case 31: ipack31(initOffset, in,out); break;
+
+ case 32: ipack32(initOffset, in,out); break;
+
+ default: break;
+ }
+}
+
diff --git a/aux/simdcomp/src/simdintegratedbitpacking.o b/aux/simdcomp/src/simdintegratedbitpacking.o
new file mode 100644
index 0000000000000000000000000000000000000000..4c334334a31aebde1b14ac551da8f993a31bf5c7
GIT binary patch
literal 152584
[base85-encoded binary object file omitted]
z$#t;3V5nQQK#G0{!sgqigKI%>bizv^^d<|&94Tzd4Zd;^OB_L04#G1S(&iRWy#+t# z0d!ywsONAK`Os$Y%1vxB1v!NYc6P%{P}P7fgJF*^*n&sMc?=heK9dq_@CA?HY7t&~ zkr;Q&!1W?D`XDO+@x>t0o-wSE2(G4JbsedZ2(9ZNGhi1%Q!5yyB<@HAS8>1c=YtLT zNn#{ID?dp6MQS91tDax^^TCFY8i}yV4{Qk8kqE8@VUY+n5^p4eYZ7qmV2wm@UGXb_ zI>c1asePp6L`bo}9Bc=vkqE9Ue&x>xyMWY4gbj9rYb#PC5jNNfcEQD>FIaNo&7vzD z_^Udw3m$^n5ybWWSV*bru&&<+_5Yv^a8SehVbNvKJ|}P)hdI9onGr)TAV348-~}h3 z-47V^d+JhDGZ?`3fEp`^fkjY#2kHNDf_pD0^LsCgE`bJpz|9m;M+-5q2<`vf z1T|1VtsC^FBT8=*F|Y_)UlI_Nk)_35wo=ElG8_?tieDu>j0d$`^XjmF;ND;ju z51&v0kA8+1fM!Y|>-tdYa`4y^QVjv0N&%04Le|coEcze;Zv3OSPg#+>xA4hQ*m6n8 zJlw;gPtuq{iqzahOqqf@xrlib@Bk=ipUlgmOE~6HQ2Kv3=TShD4B#k7okszWUBXs} zz?)U*5f56(2#n}>S{ zIt>$BpN_x`Ah=_A5i|n`s;W?$TQ7?)@gf%&h~5&kl7#o^AZy~#lROvJ0T#&pMzED0 z&}KDeZ;2gyZwXpSLh8AbMIR)wrbZUz)QIRUf$KL|KkZ`CCmGBvhm_k8>vutO*N9mw za7722wF38+kTMLs@eA1u0PE8|1of7%#SF@4z?r)T=zaK`XmLuY@6_)DKjb6CTP1h?(?cZ z^(X51Dblz!q+;>(7gzBnxsRdcpziJOmfrpxPRqaKNi&AmgXli%#r= z3Xr|r;QAB$Tqr1wgKa|@VuID57eVW5L5@U8Cg8z0r1ArvDZ!(ukopsTS`vC$$%i9H zLOOQv`V&&yqvuG_Nhg>=3L8*`kBLHBgy;ba8vFoz6(wL{$NNIgfV~Nt;m4LEIk0yA z;ay7byxvPtn*m#nWW$joA%2FO)q1h$vlM12{Sb6SAyQBy8GNk|y2C&X6mV9&16?}}IeZb; zE4&HX_XzS3O7eMFbQ#nHhFXezZVa@xgv{0b1g#eUjSD<{4O(pc8+1r4czz~~RL7G70KOmaq%nH&38vg;& zBxhETreFEX!8PE;qAyr7>&>DoEcmNWa5ef6lv#=Ebuy7$ed4Y^F~?hxZeB;JKjHW5 zppUnL?vKYi-U`_Tj2@Su^`GE8i&F4{Pk#&tjkkV+^mb7iLEvk%vE>=qcq{hXR(}>< zz~0yd@6LntKe22#0WZ!(DrONi8fdjCWEl+`*0urC8eQZqgrM2T@B+}54$ukSn0W(x zNfZZ~jfB*nIEx{a&OE$%09j`Wn^U}5^a*m{K6;RfVXwu&-A+)u5>%%=1RZLNtr!Ar z{sKoMN>c&0`2jLdcoEc=2IXh;1_R2u$nfBW-Dd?^+lg~X0n{x7+l<40p2JlcUtWSxv6)%Xr6%Xx@!uPE4gVP3jXk(u+fE}X`=~LpIp#&{s z2KyN`w82#?tWSBd=o6NC9`wN!L}-HtT#@GfAm=KeRI%WdqS%_dklpF9KIOxrPgqup z;#@QUuVTTqCC)-0v{e)AXOw~hUd4iI$(x{ASy0OWJ+wicaBw`M_L;z%VEshw%VB;N zUEsvJBoJQ3g8PXVi#}oLGl2(_uoWHfDi++?ya{R>f}D*W+Uz*m!;tC*zDh+Bb3Ex~ z(IwD=F|eOex)|^|OmJuOB4{QWlqJ!NWmfEI8>|U*#t+B^H;XsC+r{s8Yg3<%>t> z_oF}aPc;8jDA(^i`CsKM0|NttM{gSr^A3T`b3!-Iqx0HJkUqG3M)|`#JbJrO)5p(( zlm9(>oeYlt5ac=v<}@F2IQm1-3v|Ng$^S=x799K!;ywVSwWB`-89Xdcm7Y8dOD~}G z=+XHdnk1WlD)6`eWMp7Kq{q?pG`!OjTKpb{p5p;4S4PvvX!(jU|>MT{}~tP|3}91|Ns9-#`gdJ|3}94|NsAoV+MC;XDbDb(4^A5l2Qc|Jxe`9T|=`{ zFxN~IA^=J$A|L{kHmZUc7%K!ArFqynKvRkW3=9l1P&p4M4H7eevO(zy!~}`CfEXY% zpfpG<0?Kw~fiOT~IS?kpX(TbwqCJqXJ(AcIkN^V%!*L|B6;QEoB(XhEvHM73pn(*S zusM?06OaG{1H*nKu^&*eY$P!rsHZ+7iGhalK}yq+#7v;--Xn>5K*j2j#A2Xg?5rRY z7#Kid0P4GgcRv#_HXy0{ z0~NDCQYXL!GL3;@GLkwKsF*a8It!@SDI|41P_ZB+bqP?hlSt}HpkjeY>N=oePmt8j zfr{lIsoMY*lSfi_1S+;1N$dtx>>rZY8>ms2B$u zB+Y^H#{{TYGLpI_P_frYVmqK>vyjBjK*f}h%zFS8n~9|E3sg)INgW4NehZWaxkU!b zwn0*702MnA%{w4M>1~)R4ffjgVe2ovQwcnNbCTV-2tURVppK-WGD?1djVyy zg3=(dKT!5cC=C)505yUb7#PkViK#%v;*rEGpknWk#C)J)Z;`|jpkgp{L3Wft#a<(+ z>wtIZ(DClm>}yfU-lNG)U|Sl&ucQM&Nqp22>1Iu7cFPfr=?W%R!JB0|!JC zOpHOVxH7jSF^NI1xTFX|XTVrRsX2*yC8-r940N>M-i-s101JdCX`SE(;N&6U>ZT_fSDW&uyz`l%fP^31ZHqBIDlycVF6}x zFa&^U1Yrkeaxf%-X#@dj>v1p?AcVjyFEGi$0Bg^IxeN?Yb2%75lX@^Q5FG|07&sU} z3rN9CC=mmuIT%1|L%~cakp!kW7(laZU?!Bv0Mi@{uy!Aq%fP^p2WD_Efab=)Oej$T zra2fuYe&FLC{YEbIUt=RFdITZ{KLTjnyi9|K*%;Q#lZk;AA-3I3=BPB1_uLZ%_Eoz zB_@Gs4hGPa6PO7lW`Jo92GGP1mvP$&9afFILkcNoE`X#|oaRq1whV@ONpyCfyAnIZLlN_jcfF?v7)<-FZiR(ec zVf~a=sQ3X$#%F-_ReGS}8_Xf#S2^^;;_EU zKB%|?q(ESR^y!pP8d`?08)T3!1_9IQ1K0|5cRPBP9ao$K^H_E)>f>9iVHvsP*}gG2P$p= z?JUFkK2xCL4qy6uoxl^>jxczi9y$9kuWnPekVW*90ph) zNg66104Z=7VErULsQ84-5OZLCB{Qgaz)gratiR+56<=@5SikB5RQ!M#L>$()x&am6AOjJH^{-w- z#RHTe;;?r9XQ+4ss9v!2e#TA?&=EKBYpyD5#A>y$9moHR2!4o15>w|?s#V`0l#9{rgc&K$&HYlezn$byK&`er>)aRq1v z59^+5h4!jt8IpgA83b&!{Tu-RQy9PL>v}h zpfNa*y$Vwx;;??(MX36MSrBnp-|aS3{J?yOIIREn6e@183?dHe!-480kogOsgIlnE 
z+)tSL%@Fmlz8o_vBwa1o2@!|&=Xjyw0ni2ttWPHn6=ygOQ4j0aDMH0BoP~(P`gYn- zaRcZe8mxb33Kdtl4N(v4-IIQ1S2vz?9RPiw|Fu?kL)lhK-Hb^HA*8l5(if`b6h=b}sP>C}S zDt>_rRh*$XGdCqOuOv0SD6u3pB`LF{ATc>RGcR2)nL*FT*#tzGg9t+dFlh`X&A_B3 zh%_<+lcr$O0z?`cf=RGeW3XNmuwE0eUK6lh6R=)WuwGNJUQ@7MGq7GWuwFB;UURTs zbFf}>uwHYpUJI~Z3$R`buwF~BUP}{(%+frNCkz-eK~#BWNk)EYNp50sHdF*kAxRiQ zltL*a2_uLEltPj)hDbmuBncCU1e8LOFoj4!DI^Iqhy;{Ek}!uzKq({%3y1`iLXxnA zNI)qh2}7u#VKlN7)Ypbk8d(bJZ$l`JECuzsA(Tdzg8JPMN+U}_eQyY*k)@#iH-ys2 zQqT}Egwn`T&@eEB(#TTKP%wnjNK!`7(16j%QqWK^g3`!R&`>af(#TTKP%whh$WqWy zFoM#^QqWK^g3`!R&`>af(#TTKP%whh$WqWyFoM#^QqWK^g3?G*#?a7!(a2KJP%wtl z$WqWyFgAkaH$xZ$E@BK5f#*q6n1~sSVGd(hz!;V=1}L#YvN9-bLRoObK`9R^1~(p* zuApLY3qYv`Dh9U#lqMizpd0qQ0~DW7S-2gb*n*0|?Eu9ER19thD5N1`pn!$4 z;C2`rflFS9aEc+AgH~E0iKZBW%5V^d^*2GKIB3-A&wmKG3+=yx#6hD@FmV-VKNTd- z!@$4*6L*33M?vBstuXOQXulIA&WEJ_3ABF+5(mu}!_-GY`;j1V&^RMZd<=dX1lk`0iOV9Hvjy7k0g1~Yi(dxCJ_FcZ(EL2ioFmYF3`jjFN@3zlp#2q)IB0AU zCO!q)F9C@wA({W>Kf-)vB=HnzKLey5G*kgI=Lxhw0TNe5QeOh?H-N-JW0x@XF3|n~ zNF3CLfr*p|kk>0AZct_O)Dr*jr)yB;KtoX(#>+w~xE zG;>hV-1=_9$i6f`;3TV3?B#xZUk3ie?AaUe$ zZUSxBgT#^3`4VWm9wd&O&P|~0dXPABI$r{9*Mr27(|HKAT@MmRPUkGpc0EWOIh|jD zw(CLS$m!e!+O7wQBd2o}XuBRHj-1X#pzV5)IC45yfwt>G;>hV-1lq0#i6f`;5NNv| zB#xZUr$F2FAaUe$ZUb$8Ac=$K9$?~Aki;vI%vpjYUV$XO1xdUdN&EA_nSTXIyc9|O6C`oa z90^SQ7bNjwB=s!NbO?%tA|!DUByrFXD$E=eByrH#JWSjKNjx9P92X>U&|D5oeF&0x zE|U5yNaC4D;!lvoK|{bWbG{&ngQgr|;w+pXA3@SpGLktWNaBe|;wnhu@krt(NaCQO zYMA*hNaE2*>O+vkBay^Yki^50#7mIGLy^Q=kiM(n!Ac+Sesb7L5?vEtC1xef& zN&EO+uJES)k<@C>ETN)QcdAgQg;2>Q#`$LF*A= z;wDJqwn*lnIsc%6NH%1blf+P-Vw8P9zS z7bJ1eTpUb11W7y&$($4{R{*Ad3X(WzE*U1i1W7yu$($`n z;z3B_N07upbBr)^t{{nnx+XC3CrIKxNalP&5(iCv!PK+xAi^IsrwJ1mK@xXEGDihT z95mMsQ*VMK?u4Y?1xXwZc%y zn<1%Rf+TK&B)$bn9MolmnR5h59JFQuCVmA;95l5F6Musw4(jT`#6fK!5C)BsfE#cw@P;uBg5m-Db zK*gcmYq)v=s5oTZ17tY|$b1H6R`CxF!2pgaS4zBk~s^Y;;?ltAZd_)CqTua(|%y!xCKZ6 z$@~PUICOdsZcYGH9Ja0+qzz=g15_NgUIr#^02Pk_2_TuT02PO=qX9{S%n^W!XMhBd z)H6WEq1|-2dpux~OAoDLk#XCR(Nai1aio@39fTTg{H$cT<>%L*) z3!vh#bviKd2~hDBAOR%v8=&H_^*bPGkog5r@f{!mB=reUacK7%?%x2Y_z5)i4p4E> zdJa(11xbUacDOh?w$|O^798sA(HtI zpyJ>GFeLE{P;n>)H|GR2eqsG%kZ~aQJb>0K5+Dw;IZ$!fd>Ksr0jM}=y$EtRY=DXz zpqaA(Dh^vW4>M;1RNMhgeFIe715LaDDjt9)o&Xh(Kobvuio@37!|Zi{if5pyH-L&4 zpouF$#VgRn1)$;$XyOb|aoGGL%-#>scG3hi^$(!pGtk5@K*blJi64N9uRs&u02PPr z6M)&f04lx%P5lI@_yIKW2B`Q6H1Ptc_ysia1gJP{-vP|t0I2u_H1!Tp@fT>~22k-2 zXyOV`anQOMGeH>CmIKis zF%X8eKVa<>kQfNV+8Lm>J+gXOdjdv-)PgW{ zk=zpiYP2#iFu>|NkXjIit;h30Qx98@2dk$*YC#yb94m>S62gVC5c28iZl%@lrq>B=xZMc(C#cBn`r_^?0E5bD$^&iGeU| zJszx_0f~VyY(3r<5C_S8*m^uz`2dmzVc2@SJ0K2{df0kASiS~HgYX0BdOQ|reguhu zFl;@Z1iCo1!K;HN4qK1sfF=%Gj~9X_4qK0xfi4c6f2u(fhpoq(fF?cx+CW}{CJtMV zw*yTawjS>cnmBAd-UBpo*m}G#XyUN-cpT96yr3`xVc2>+88mU&dOQO(aoBo1(7G|? 
z^Z;9r7lEc8wjM7BO&q=+4^14l9&ZYoIBY%M3N&%pdb~Yo;;{927tq9E>+xXgNI_u$ z!m#ytKhV^}*5mO&w>W^*f-r17o&uUUY(1U{nmBAdo(GyZY&~8KnmBAdUICgoY&~8J znmBAd-V8Kx*m}G*XyUN-cn8qLVe9c=>p4Mw1!35FyccNdVe9e!pozoQ;|V|~V?btr zFl;@Z3Ys`1Wg>a9uKx&4&)XPhONh&gQgy~9&ZDh zIBY%M5j1hwdb}HG;s(%pyEka!3!w9M;7#O6 z4qJ~GfF=%GkC%cb4qJ~`fhG=HkJp1H4qK1608JdW9&ZbpIBY%M2{duodb~Sm;;{92 zAJD{M>+x8i1681~0b$sBJW!cL2?n+2Kvgv~y5L5^#+yDs+kG$z*ti&s51Pvb6(KP7 zFmVGtkSn2w1TZi#fac0U_CUSC02@aG&4Gf%p~W{$JA(meVLqIPV1U#jcnl1n{Y)S; z$<+@UYX_N$?tfzK2hB}@^h13McN$DTtbT?`fZPgWgJ@8=VGTc9h>0H1eJ7ys1Gy2V z9~OR~xfhWAptS-Z^I$jt+JS}jlVRep_=C})IVO;P*mxvNor4g>e$d=CObkSW)FRWM zIRKa#NF0V!q55Hb7>%wLG=~WigV+XB#lVmO)h__^FjNYS%K(~-#1?)PZV(e;{RB|> zf#$wp`eES*T6=+>c0g8DNI~?2=9EEv7zVi&#s<+KK13ymhlHgeq79(MBhYN1#lQgD cP6 +#include +#include "simdcomp.h" + + +int main() { + int N = 5000 * SIMDBlockSize; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (int gap = 1; gap <= 387420489; gap *= 3) { + printf(" gap = %u \n", gap); + for (int k = 0; k < N; ++k) + datain[k] = k * gap; + uint32_t offset = 0; + for (int k = 0; k * SIMDBlockSize < N; ++k) { + ///////////////////////////// + // First part works for general arrays (sorted or unsorted) + ///////////////////////////// + // we compute the bit width + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b 128-bit vectors at "buffer" + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpack(buffer, backbuffer, b);//uncompressed + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack\n"); + return -2; + } + } + ///////////////////////////// + // next part assumes that the data is sorted (uses differential coding) + ///////////////////////////// + // we compute the bit width + const uint32_t b1 = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b1 128-bit vectors at "buffer" + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, + b1); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpackd1(offset, buffer, backbuffer, b1); + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack d1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} diff --git a/aux/simple8b.c b/aux/simple8b.c new file mode 100644 index 0000000..0bec2b7 --- /dev/null +++ b/aux/simple8b.c @@ -0,0 +1,333 @@ +// modified and speed optimized 64 bits version from: +// Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. +// Softw., Pract. Exper. 
diff --git a/aux/simple8b.c b/aux/simple8b.c
new file mode 100644
index 0000000..0bec2b7
--- /dev/null
+++ b/aux/simple8b.c
@@ -0,0 +1,333 @@
+// modified and speed optimized 64 bits version from:
+// Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words.
+// Softw., Pract. Exper. 40(2): 131-147 (2010)
+// http://ww2.cs.mu.oz.au/~alistair/coders-64bit/
+
+  #if defined(__x86_64__) || defined(__i386__)
+static inline int bsr32(int x) {
+  int b = -1;
+  asm("bsrl %1,%0" : "+r" (b): "rm" (x) );
+  return b + 1;
+}
+  #else
+static inline int bsr32(int x) {
+  return x?32 - __builtin_clz(x):0;
+}
+  #endif
+
+#define WPUT(__x,__bit)  { __bw |= (unsigned long long)(__x)<<__br; __br += __bit; }
+#define WPUTZERO(__sel)  { __bw = __br = 0; WPUT(__sel,4); }
+#define WPUTFLUSH(__out) { *(typeof(__bw) *)__out = __bw; __out += sizeof(__bw)/sizeof(__out[0]); }
+
+#if 0 //WORD_SIZE==32
+  #define CODE_TABLE \
+  unsigned char sel2bit[]=   {  0,  0,  0,  0,  0,  0,  0,  1, 2,3,4,5,7,9,14,28}; \
+  unsigned      sel2elems[]= {256,120, 90, 60, 50, 40, 32, 28,14,9,7,5,4,3, 2, 1};
+
+  #define BIT_2_SEL \
+  char bit2sel[]= { 0,7,8,9,10,11,12,12,13,13,14,14,14,14,14, \
+                    15,15,15,15,15,15,15,15,15,15,15,15,15,15, \
+                    -1,-1,-1,-1};
+  #define MAX_BIT 28
+#else
+#define CODE_TABLE \
+  /*                          0   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */ \
+unsigned char sel2bit[]=   {  0,  0, 1, 2, 3, 4, 5, 6, 7, 8,10,12,15,20,30,60,61}; \
+unsigned      sel2elems[]= {256,120,60,30,20,15,12,10, 8, 7, 6, 5, 4, 3, 2, 1}; \
+unsigned      sellim[]=    {256,120,60,60,60,60,60,60,56,56,60,60,60,60,60,60};
+
+#define BIT_2_SEL char bit2sel[]= \
+  {0,2,3,4,5,6,7,8,          9,10,10,11,11,12,12,12, \
+   13,13,13,13,13,14,14,14,  14,14,14,14,14,14,14,15, \
+   15,15,15,15,15,15,15,15,  15,15,15,15,15,15,15,15, \
+   15,15,15,15,15,15,15,15,  15,15,15,15,15,-1, -1, -1, -1};
+
+  #define MAX_BIT 60
+#endif
+
+CODE_TABLE
+BIT_2_SEL
+
+unsigned char *s8benco(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) {
+  unsigned long long __bw; unsigned __br = 0;
+  unsigned char bits[0x100];
+  int elems;
+  int i,j;
+  for (i = 0; i < n; i++) {
+    unsigned xb = in[i];
+    bits[i] = bsr32(xb)+1;
+  } //CalcMinBits(in, bits, n);
+  int sel, bit,tmp; /*BLK_ENC_ADJUST*/
+  for (i=0; i<n; ) {
+    /* scan ahead, upgrading the selector whenever a value needs more bits */
+    for (sel = bit = elems = 0, j = i; j < n && elems < sel2elems[sel]; j++) {
+      if (bits[j] > bit) {
+        tmp = bit2sel[bits[j]] ;
+        if(elems < sel2elems[ tmp ]) {
+          sel = tmp;
+          bit= sel2bit[sel];
+        } else {
+          while ( elems < sel2elems[sel] ) sel++;
+          elems = sel2elems[sel];
+          bit = sel2bit[sel];
+          break;
+        }
+      }
+      elems++;
+    }
+    if (bit == 0) { /* can be downgraded to bit=1 */
+      if (i+elems < n) {
+        for (sel = 2; sel2elems[sel] > elems; sel++);
+        elems = sel2elems[sel];
+        bit = sel2bit[sel];
+      } else sel = 0; /* what a waste!
*/ + } else { + sel = bit2sel[bit]; + bit = sel2bit[sel]; + } + WPUTZERO(sel); + if (bit) { + for ( ; elems ; elems--, i++) WPUT(in[i],bit); + } else + i += elems; + WPUTFLUSH(out); + } + return out; +} + +#define MSK(__x) ((1ul<<__x)-1) +unsigned char *s8bdeco(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { + unsigned char *ip = in; + unsigned i,*_out = out,*out_ = out+n; + while(out < out_) { + unsigned long long w = *(unsigned long long *)ip; + switch(w & 15) { + #if 0 + case 0: ip+=8; for(i=0; i<256; i++) out[i]= 1; out += 256; break; + #else + case 0: { int r = (w>>4)&0xf; ip++; if(r == 0xf) { r = (w>>8)&0xff; ip++; } while(r-->=0) *out++=0; } break; + #endif + + case 1: ip+=8; + for(i=0; i<120; i++) out[i]= 1; out += 120; + break; + case 2: ip+=8; + out[ 0]= (w >> 4) & MSK(1); + out[ 1]= (w >> 5) & MSK(1); + out[ 2]= (w >> 6) & MSK(1); + out[ 3]= (w >> 7) & MSK(1); + out[ 4]= (w >> 8) & MSK(1); + out[ 5]= (w >> 9) & MSK(1); + out[ 6]= (w >> 10) & MSK(1); + out[ 7]= (w >> 11) & MSK(1); + out[ 8]= (w >> 12) & MSK(1); + out[ 9]= (w >> 13) & MSK(1); + out[10]= (w >> 14) & MSK(1); + out[11]= (w >> 15) & MSK(1); + out[12]= (w >> 16) & MSK(1); + out[13]= (w >> 17) & MSK(1); + out[14]= (w >> 18) & MSK(1); + out[15]= (w >> 19) & MSK(1); + out[16]= (w >> 20) & MSK(1); + out[17]= (w >> 21) & MSK(1); + out[18]= (w >> 22) & MSK(1); + out[19]= (w >> 23) & MSK(1); + out[20]= (w >> 24) & MSK(1); + out[21]= (w >> 25) & MSK(1); + out[22]= (w >> 26) & MSK(1); + out[23]= (w >> 27) & MSK(1); + out[24]= (w >> 28) & MSK(1); + out[25]= (w >> 29) & MSK(1); + out[26]= (w >> 30) & MSK(1); + out[27]= (w >> 31) & MSK(1); + out[28]= (w >> 32) & MSK(1); + out[29]= (w >> 33) & MSK(1); + out[30]= (w >> 34) & MSK(1); + out[31]= (w >> 35) & MSK(1); + out[32]= (w >> 36) & MSK(1); + out[33]= (w >> 37) & MSK(1); + out[34]= (w >> 38) & MSK(1); + out[35]= (w >> 39) & MSK(1); + out[36]= (w >> 40) & MSK(1); + out[37]= (w >> 41) & MSK(1); + out[38]= (w >> 42) & MSK(1); + out[39]= (w >> 43) & MSK(1); + out[40]= (w >> 44) & MSK(1); + out[41]= (w >> 45) & MSK(1); + out[42]= (w >> 46) & MSK(1); + out[43]= (w >> 47) & MSK(1); + out[44]= (w >> 48) & MSK(1); + out[45]= (w >> 49) & MSK(1); + out[46]= (w >> 50) & MSK(1); + out[47]= (w >> 51) & MSK(1); + out[48]= (w >> 52) & MSK(1); + out[49]= (w >> 53) & MSK(1); + out[50]= (w >> 54) & MSK(1); + out[51]= (w >> 55) & MSK(1); + out[52]= (w >> 56) & MSK(1); + out[53]= (w >> 57) & MSK(1); + out[54]= (w >> 58) & MSK(1); + out[55]= (w >> 59) & MSK(1); + out[56]= (w >> 60) & MSK(1); + out[57]= (w >> 61) & MSK(1); + out[58]= (w >> 62) & MSK(1); + out[59]= (w >> 63) & MSK(1); out += 60; + break; + case 3: ip+=8; + out[ 0]= (w >> 4) & MSK(2); + out[ 1]= (w >> 6) & MSK(2); + out[ 2]= (w >> 8) & MSK(2); + out[ 3]= (w >> 10) & MSK(2); + out[ 4]= (w >> 12) & MSK(2); + out[ 5]= (w >> 14) & MSK(2); + out[ 6]= (w >> 16) & MSK(2); + out[ 7]= (w >> 18) & MSK(2); + out[ 8]= (w >> 20) & MSK(2); + out[ 9]= (w >> 22) & MSK(2); + out[10]= (w >> 24) & MSK(2); + out[11]= (w >> 26) & MSK(2); + out[12]= (w >> 28) & MSK(2); + out[13]= (w >> 30) & MSK(2); + out[14]= (w >> 32) & MSK(2); + out[15]= (w >> 34) & MSK(2); + out[16]= (w >> 36) & MSK(2); + out[17]= (w >> 38) & MSK(2); + out[18]= (w >> 40) & MSK(2); + out[19]= (w >> 42) & MSK(2); + out[20]= (w >> 44) & MSK(2); + out[21]= (w >> 46) & MSK(2); + out[22]= (w >> 48) & MSK(2); + out[23]= (w >> 50) & MSK(2); + out[24]= (w >> 52) & MSK(2); + out[25]= (w >> 54) & MSK(2); + out[26]= (w >> 56) & MSK(2); + out[27]= (w >> 58) & MSK(2); + out[28]= (w 
>> 60) & MSK(2); + out[29]= (w >> 62) & MSK(2); out += 30; + break; + case 4: ip+=8; + out[ 0]= (w >> 4) & MSK(3); + out[ 1]= (w >> 7) & MSK(3); + out[ 2]= (w >> 10) & MSK(3); + out[ 3]= (w >> 13) & MSK(3); + out[ 4]= (w >> 16) & MSK(3); + out[ 5]= (w >> 19) & MSK(3); + out[ 6]= (w >> 22) & MSK(3); + out[ 7]= (w >> 25) & MSK(3); + out[ 8]= (w >> 28) & MSK(3); + out[ 9]= (w >> 31) & MSK(3); + out[10]= (w >> 34) & MSK(3); + out[11]= (w >> 37) & MSK(3); + out[12]= (w >> 40) & MSK(3); + out[13]= (w >> 43) & MSK(3); + out[14]= (w >> 46) & MSK(3); + out[15]= (w >> 49) & MSK(3); + out[16]= (w >> 52) & MSK(3); + out[17]= (w >> 55) & MSK(3); + out[18]= (w >> 58) & MSK(3); + out[19]= (w >> 61) & MSK(3); out += 20; + break; + case 5: ip+=8; + out[ 0]= (w >> 4) & MSK(4); + out[ 1]= (w >> 8) & MSK(4); + out[ 2]= (w >> 12) & MSK(4); + out[ 3]= (w >> 16) & MSK(4); + out[ 4]= (w >> 20) & MSK(4); + out[ 5]= (w >> 24) & MSK(4); + out[ 6]= (w >> 28) & MSK(4); + out[ 7]= (w >> 32) & MSK(4); + out[ 8]= (w >> 36) & MSK(4); + out[ 9]= (w >> 40) & MSK(4); + out[10]= (w >> 44) & MSK(4); + out[11]= (w >> 48) & MSK(4); + out[12]= (w >> 52) & MSK(4); + out[13]= (w >> 56) & MSK(4); + out[14]= (w >> 60) & MSK(4); out += 15; + break; + case 6: ip+=8; + out[ 0]= (w >> 4) & MSK(5); + out[ 1]= (w >> 9) & MSK(5); + out[ 2]= (w >> 14) & MSK(5); + out[ 3]= (w >> 19) & MSK(5); + out[ 4]= (w >> 24) & MSK(5); + out[ 5]= (w >> 29) & MSK(5); + out[ 6]= (w >> 34) & MSK(5); + out[ 7]= (w >> 39) & MSK(5); + out[ 8]= (w >> 44) & MSK(5); + out[ 9]= (w >> 49) & MSK(5); + out[10]= (w >> 54) & MSK(5); + out[11]= (w >> 59) & MSK(5); out += 12; + break; + case 7: ip+=8; + out[0]= (w >> 4) & MSK(6); + out[1]= (w >> 10) & MSK(6); + out[2]= (w >> 16) & MSK(6); + out[3]= (w >> 22) & MSK(6); + out[4]= (w >> 28) & MSK(6); + out[5]= (w >> 34) & MSK(6); + out[6]= (w >> 40) & MSK(6); + out[7]= (w >> 46) & MSK(6); + out[8]= (w >> 52) & MSK(6); + out[9]= (w >> 58) & MSK(6); out += 10; + break; + case 8: ip+=8; + out[0]= (w >> 4 ) & MSK(7); + out[1]= (w >> 11) & MSK(7); + out[2]= (w >> 18) & MSK(7); + out[3]= (w >> 25) & MSK(7); + out[4]= (w >> 32) & MSK(7); + out[5]= (w >> 39) & MSK(7); + out[6]= (w >> 46) & MSK(7); + out[7]= (w >> 53) & MSK(7); out += 8; + break; + case 9: ip+=8; + out[0]= (w >> 4 ) & MSK(8); + out[1]= (w >> 12) & MSK(8); + out[2]= (w >> 20) & MSK(8); + out[3]= (w >> 28) & MSK(8); + out[4]= (w >> 36) & MSK(8); + out[5]= (w >> 44) & MSK(8); + out[6]= (w >> 52) & MSK(8); out += 7; + break; + case 10: ip+=8; + out[0]= (w >> 4) & MSK(10); + out[1]= (w >> 14) & MSK(10); + out[2]= (w >> 24) & MSK(10); + out[3]= (w >> 34) & MSK(10); + out[4]= (w >> 44) & MSK(10); + out[5]= (w >> 54) & MSK(10); out += 6; + break; + case 11: ip+=8; + out[0]= (w >> 4) & MSK(12); + out[1]= (w >> 16) & MSK(12); + out[2]= (w >> 28) & MSK(12); + out[3]= (w >> 40) & MSK(12); + out[4]= (w >> 52) & MSK(12); out += 5; + break; + case 12: ip+=8; + out[0]= (w >> 4) & MSK(15); + out[1]= (w >> 19) & MSK(15); + out[2]= (w >> 34) & MSK(15); + out[3]= (w >> 49) & MSK(15); out += 4; + break; + case 13: ip+=8; + out[0]= (w >> 4) & MSK(20); + out[1]= (w >> 24) & MSK(20); + out[2]= (w >> 44) & MSK(20); out += 3; + break; + case 14: ip+=8; + out[0]= (w >> 4) & MSK(30); + out[1]= (w >> 34) & MSK(30); out += 2; + break; + case 15: ip+=8; + out[0]= (w >> 4) & ((1ull<<60)-1); out += 1; + break; + } + } + return ip; +} + diff --git a/aux/simple8b.h b/aux/simple8b.h new file mode 100644 index 0000000..8772124 --- /dev/null +++ b/aux/simple8b.h @@ -0,0 +1,2 @@ +unsigned char 
*s8benco(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out);
+unsigned char *s8bdeco(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);

diff --git a/aux/simple8b.o b/aux/simple8b.o
new file mode 100644
index 0000000000000000000000000000000000000000..7de646cd009a13c06f52427db8a9e9f885ea4d7b
GIT binary patch
literal 8840
[... base85-encoded binary data (compiled object file) omitted ...]

literal 0
HcmV?d00001
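The word layout that drives the big switch in s8bdeco is easiest to see on a hand-built word: the 4-bit selector sits in the low bits and alone announces how many values follow and at what width. A small self-contained sketch for selector 14, i.e. two 30-bit values, mirroring the case 14 branch above (editor's illustration, not part of the patch):

    #include <stdio.h>

    int main(void) {
        /* selector 14 in the low 4 bits, then two 30-bit payloads */
        unsigned long long w = 14ULL
                             | 123456789ULL << 4     /* first value  */
                             | 987654321ULL << 34;   /* second value */
        unsigned v0 = (w >>  4) & ((1ULL << 30) - 1);
        unsigned v1 = (w >> 34) & ((1ULL << 30) - 1);
        printf("%u %u\n", v0, v1);   /* prints: 123456789 987654321 */
        return 0;
    }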
diff --git a/aux/vabyte.h b/aux/vabyte.h
new file mode 100644
index 0000000..627318b
--- /dev/null
+++ b/aux/vabyte.h
@@ -0,0 +1,99 @@
+// "variablebyte.h" C Version from https://github.com/lemire/FastPFor
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://lemire.me/en/
+ */
+#include <stddef.h>   /* size_t */
+
+#define extract7bits(i, val)         ((val >> (7 * i)) & ((1U << 7) - 1))
+#define extract7bitsmaskless(i, val) (val >> (7 * i))
+
+unsigned char *vbyteenc(unsigned *in, const size_t length, unsigned *out/*,
+    size_t &nvalue*/) {
+  unsigned char *bout = (unsigned char *)(out);
+  //const unsigned char *const initbout = (unsigned char *)(out);
+  //unsigned prev = 0;
+  size_t k;
+  for (k = 0; k < length; ++k) {
+    const unsigned val = /*delta ? in[k] - prev :*/ in[k];
+    //if (delta) prev = in[k];
+    /**
+     * Code below could be shorter. Whether it could be faster
+     * depends on your compiler and machine.
+     */
+    if (val < (1U << 7)) {
+      *bout = (unsigned char)(val | (1U << 7));
+      ++bout;
+    } else if (val < (1U << 14)) {
+      *bout = extract7bits(0,val);
+      ++bout;
+      *bout = extract7bitsmaskless(1,val) | (1U << 7);
+      ++bout;
+    } else if (val < (1U << 21)) {
+      *bout = extract7bits(0,val);
+      ++bout;
+      *bout = extract7bits(1,val);
+      ++bout;
+      *bout = extract7bitsmaskless(2,val) | (1U << 7);
+      ++bout;
+    } else if (val < (1U << 28)) {
+      *bout = extract7bits(0, val);
+      ++bout;
+      *bout = extract7bits(1, val);
+      ++bout;
+      *bout = extract7bits(2, val);
+      ++bout;
+      *bout = extract7bitsmaskless(3, val) | (1U << 7);
+      ++bout;
+    } else {
+      *bout = extract7bits(0,val);
+      ++bout;
+      *bout = extract7bits(1,val);
+      ++bout;
+      *bout = extract7bits(2,val);
+      ++bout;
+      *bout = extract7bits(3,val);
+      ++bout;
+      *bout = extract7bitsmaskless(4,val) | (1U << 7);
+      ++bout;
+    }
+  }
+  /*while (needPaddingTo32Bits(bout)) {
+    *bout++ = 0;
+  }
+  const size_t storageinbytes = bout - initbout;
+  assert((storageinbytes % 4) == 0);
+  nvalue = storageinbytes / 4;*/
+  return bout;
+}
+
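One detail of the scheme above worth spelling out: the high bit marks the last byte of a value, the reverse of the LEB128/protobuf convention where a set high bit means "more bytes follow". Worked by hand for the value 300 = 2*128 + 44 (editor's illustration only):

    /* what vbyteenc emits for the value 300 */
    unsigned char b0 = 300 & 127;          /* 44:  low 7 bits, high bit clear */
    unsigned char b1 = (300 >> 7) | 128;   /* 130: top 7 bits, stop bit set   */
    /* the decoder reassembles: 44 + ((130 & 127) << 7) = 44 + 256 = 300 */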
+
+unsigned char *vbytedec(const unsigned *in, const size_t length,
+    unsigned *out/*, size_t &nvalue*/) {
+  unsigned prev = 0;
+  if (length == 0) {
+    //nvalue = 0;
+    return (unsigned char *)in;//abort
+  }
+  const unsigned char *inbyte = (const unsigned char *)(in);
+  const unsigned *const endout = out + length;
+  //const unsigned *const initout(out);
+
+  while (endout > out) {
+    unsigned int shift = 0; unsigned v;
+    for (v = 0; endout > out; shift += 7) {
+      unsigned char c = *inbyte++;
+      v += ((c & 127) << shift);
+      if ((c & 128)) {
+        *out++ = /*delta ? (prev = v + prev) :*/ v;
+        break;
+      }
+    }
+  }
+  //nvalue = out - initout;
+  //inbyte = padTo32bits(inbyte);
+  return (unsigned char *)inbyte;
+}
+
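Both helpers return one past the last byte they touch, so the compressed size falls out as a pointer difference. A usage sketch (editor's illustration only, assuming vabyte.h above is in scope; the 11-byte figure follows from the 1/2/3/5-byte branches of vbyteenc):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        unsigned in[4] = { 1, 300, 70000, 1u << 30 }, out[4];
        unsigned buf[8];   /* 4 values * 5 bytes worst case, rounded up */
        unsigned char *end = vbyteenc(in, 4, buf);
        printf("%u bytes\n", (unsigned)(end - (unsigned char *)buf));  /* 1+2+3+5 = 11 */
        vbytedec(buf, 4, out);
        printf("roundtrip ok: %d\n", memcmp(in, out, sizeof in) == 0); /* prints 1 */
        return 0;
    }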
diff --git a/aux/varintg8iu.c b/aux/varintg8iu.c
new file mode 100644
index 0000000..29ebfa4
--- /dev/null
+++ b/aux/varintg8iu.c
@@ -0,0 +1,181 @@
+// C Version of "VarIntG8IU.h" from https://github.com/lemire/FastPFor
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ */
+/**
+ *
+ * Implementation of varint-G8IU taken from
+ * Stepanov et al., SIMD-Based Decoding of Posting Lists, CIKM 2011
+ *
+ * Update: D. Lemire believes that this scheme was patented by Rose, Stepanov et al. (patent 20120221539).
+ * We wrote this code before the patent was published (August 2012).
+ *
+ * By Maxime Caron
+ * From
+ * https://github.com/maximecaron/SIMD-Based-Posting-lists
+ * with minor modifications by D. Lemire.
+ */
+#ifndef __SSSE3__
+#pragma message "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3"
+#else
+#ifndef VARINTG8IU_H__
+#define VARINTG8IU_H__
+#include <string.h>
+//#include "codecs.h"
+#ifdef __GNUC__
+#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
+#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#else
+#define PREDICT_FALSE(x) x
+#define PREDICT_TRUE(x) x
+#endif
+#include "varintg8iu.h"
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+
+static int maskOutputSize[256];
+static char mask[256][32];
+
+int getNumByteNeeded(const uint32_t value) {
+  if (value > 0x000000FF) {
+    if (value > 0x0000FFFF) {
+      if (value > 0x00FFFFFF) {
+        return 4;
+      } else {
+        return 3;
+      }
+    } else {
+      return 2;
+    }
+  } else {
+    return 1;
+  }
+}
+
+
+// For all possible values of the
+// descriptor we build a table of any shuffle sequence
+// that might be needed at decode time.
+void VarIntG8IU() {
+  for (int desc = 0; desc <= 255; desc++) {
+    int bitmask = 0x00000001;
+    int bitindex = 0;
+    // count the number of 0 bits in the descriptor byte
+    int complete = 0;
+    int ithSize[8];
+    int lastpos = -1;
+    while (bitindex < 8) {
+      if ((desc & bitmask) == 0) {
+        ithSize[complete] = bitindex - lastpos;
+        lastpos = bitindex;
+        complete++;
+      }
+      bitindex++;
+      bitmask = bitmask << 1;
+    }
+    maskOutputSize[desc] = complete;
+
+    int j = 0;
+    int k = 0;
+    for (int i = 0; i < complete; i++) {
+      for (int n = 0; n < 4; n++) {
+        if (n < ithSize[i]) {
+          mask[desc][k] = j;
+          j = j + 1;
+        } else {
+          mask[desc][k] = -1;
+        }
+        k = k + 1;
+      }
+    }
+
+  }
+
+}
+
+unsigned char *vintg8enc(const uint32_t *__restrict__ in, const size_t length, unsigned char *__restrict__ out) {
+  const uint32_t *in_ = in + length; //size_t srclength = length * 4;unsigned char* dest = (unsigned char*)(out);size_t dstlength = length * 4;
+  //size_t compressed_size = 0;
+  while(in < in_ /*srclength > 0 && dstlength >= 9*/) { //compressed_size += encodeBlock(in, srclength, dst, nvalue);
+    unsigned char desc = 0xFF;
+    unsigned char bitmask = 0x01;
+    uint32_t buffer[8];
+    int ithSize[8];
+    int len = 0;                 // bytes used so far in this 8-byte block
+    int numInt = 0;
+
+    while (in < in_ /*srclength > 0*/) {
+      const uint32_t* temp = in;
+      int byteNeeded = getNumByteNeeded(*temp);
+
+      if (PREDICT_FALSE(len + byteNeeded > 8)) {
+        break;
+      }
+
+      //flip the correct bit in desc
+      bitmask = bitmask << (byteNeeded - 1);
+      desc = desc ^ bitmask;
+      bitmask = bitmask << 1;
+
+      ithSize[numInt] = byteNeeded;
+      len += byteNeeded;
+      buffer[numInt] = *temp;
+      ++in;// = in + 1;
+      //srclength -= 4;
+      numInt++;
+    }
+    out[0] = desc;
+    int written = 1;
+    for(int i = 0; i < numInt; i++) {
+      int size = ithSize[i];
+      uint32_t value = buffer[i];
+      for (int j = 0; j < size; j++) {
+        out[written++] = value >> (j * 8);
+      }
+    }
+    out += 9; //dstlength -= 9; //compressed_size += 9;
+  }
+  // Output might not be a multiple of 4 so we make it so
+  return out; //out + ((compressed_size + 3)/ 4);
+}
+
+unsigned char *vintg8dec(const unsigned char *__restrict__ in, const size_t length, uint32_t *__restrict__ out) {
+  size_t srclength = length * 4;
+  const unsigned *out_ = out + length; //uint32_t * dest = out;size_t nvalue = length * 4; //uint32_t uncompressSize = 0;
+  while (out < out_ /*srclength >= 9*/) { //uncompressSize += decodeBlock(in, srclength, dst/*, nvalue*/);
+    const unsigned char* pdesc = in++;
+    unsigned char desc = *pdesc;
+    srclength -= 1;
+
+    const unsigned char* peek = in;
+    v16qi data;
+    if (PREDICT_TRUE(srclength >= 16)) {
+      // read 16 bytes of data only if we can, to avoid overrunning the input
+      data = __builtin_ia32_lddqu((const char*) (peek));
+    } else {
+      static char buff[16];
+      memcpy(buff, peek, 8);
+      data = __builtin_ia32_lddqu(buff);
+    }
+    // load the required shuffle mask
+    v16qi shf = __builtin_ia32_lddqu(mask[desc]);
+    v16qi result = __builtin_ia32_pshufb128(data, shf);
+    char* dst = (char*) (out);
+    __builtin_ia32_storedqu(dst, result);
+    int readSize = maskOutputSize[desc];
+
+    if (PREDICT_TRUE( readSize >= 4)) {
+      v16qi shf2 = __builtin_ia32_lddqu(mask[desc] + 16);
+      v16qi result2 = __builtin_ia32_pshufb128(data, shf2);
+      __builtin_ia32_storedqu(dst + (16), result2);
+    }
+    // pop 8 input bytes
+    in += 8; srclength -= 8; out += readSize; //dstlength -= readSize * 4;// uncompressSize += readSize;
+  }
+  return (unsigned char *)in; //(uint32_t *) (((uintptr_t) (src) + 3) & ~3);
+}
+
+#endif // VARINTG8IU_H__
+#endif // __SSSE3__
diff --git a/aux/varintg8iu.h b/aux/varintg8iu.h
new file mode 100644
index 0000000..a2659d4
--- /dev/null
+++ b/aux/varintg8iu.h
@@ -0,0 +1,5 @@
+#include <stdint.h>
+void VarIntG8IU();
+unsigned char *vintg8enc(const uint32_t *__restrict__ in, const size_t length, unsigned char *__restrict__ out);
+unsigned char *vintg8dec(const unsigned char *__restrict__ in, const size_t length, uint32_t *__restrict__ out);
+
diff --git a/aux/vas16c.h b/aux/vas16c.h
new file mode 100644
index 0000000..15e671d
--- /dev/null
+++ b/aux/vas16c.h
@@ -0,0 +1,35 @@
+// optimized version from: http://jinruhe.com/
+static int s16_cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1};
+static int s16_cbits[16][28] = {
+  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1},
+  {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
+  {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0},
+  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0},
+  {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0},
+  {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} };
+
+#define S16ENC(__w, __p, m) { unsigned *_p = __p, *_w = __w; \
+  unsigned int _k, _j, _m, _o; \
+  for (_k = 0; _k < 16; _k++) { \
+    *_w = _k<<28; \
+    _m = (s16_cnum[_k] < m)? s16_cnum[_k]:m; \
+    /* pack values while each one fits into its slot width for selector _k */ \
+    for (_j = 0, _o = 0; (_j < _m) && (*(_p+_j) < (1<<s16_cbits[_k][_j])); _j++) { \
+      *_w |= *(_p+_j)<<_o; \
+      _o += s16_cbits[_k][_j]; \
+    } \
+    if (_j == _m) break; \
+  } \
+  __p += _m; __w++; \
+}
+
diff --git a/aux/vas16d.h b/aux/vas16d.h
new file mode 100644
--- /dev/null
+++ b/aux/vas16d.h
@@ -0,0 +1,402 @@
+// optimized version from: http://jinruhe.com/
+#define S16DEC(__w, __p) { register unsigned _rw = *(__w); typeof(__p[0]) *_p = __p; \
+  switch((_rw)>>28) {\
+  case 0:\
+    _p[ 0] = (_rw    ) & 1;\
+    _p[ 1] = (_rw>> 1) & 1;\
+    _p[ 2] = (_rw>> 2) & 1;\
+    _p[ 3] = (_rw>> 3) & 1;\
+    _p[ 4] = (_rw>> 4) & 1;\
+    _p[ 5] = (_rw>> 5) & 1;\
+    _p[ 6] = (_rw>> 6) & 1;\
+    _p[ 7] = (_rw>> 7) & 1;\
+    _p[ 8] = (_rw>> 8) & 1;\
+    _p[ 9] = (_rw>> 9) & 1;\
+    _p[10] = (_rw>>10) & 1;\
+    _p[11] = (_rw>>11) & 1;\
+    _p[12] = (_rw>>12) & 1;\
+    _p[13] = (_rw>>13) & 1;\
+    _p[14] = (_rw>>14) & 1;\
+    _p[15] = (_rw>>15) & 1;\
+    _p[16] = (_rw>>16) & 1;\
+    _p[17] = (_rw>>17) & 1;\
+    _p[18] = (_rw>>18) & 1;\
+    _p[19] = (_rw>>19) & 1;\
+    _p[20] = (_rw>>20) & 1;\
+    _p[21] = (_rw>>21) & 1;\
+    _p[22] = (_rw>>22) & 1;\
+    _p[23] = (_rw>>23) & 1;\
+    _p[24] = (_rw>>24) & 1;\
+    _p[25] = (_rw>>25) & 1;\
+    _p[26] = (_rw>>26) & 1;\
+    _p[27] = (_rw>>27) & 1; _p += 28;\
+    break;\
+  case 1: \
+    _p[ 0] = (_rw    ) & 3;\
+    _p[ 1] = (_rw>> 2) & 3;\
+    _p[ 2] = (_rw>> 4) & 3;\
+    _p[ 3] = (_rw>> 6) & 3;\
+    _p[ 4] = (_rw>> 8) & 3;\
+    _p[ 5] = (_rw>>10) & 3;\
+    _p[ 6] = (_rw>>12) & 3;\
+    _p[ 7] = (_rw>>14) & 1;\
+    _p[ 8] = (_rw>>15) & 1;\
+    _p[ 9] = (_rw>>16) & 1;\
+    _p[10] = (_rw>>17) & 1;\
+    _p[11] = (_rw>>18) & 1;\
+    _p[12] = (_rw>>19) & 1;\
+    _p[13] = (_rw>>20) & 1;\
+    _p[14] = (_rw>>21) & 1;\
+    _p[15] = (_rw>>22) & 1;\
+    _p[16] = (_rw>>23) & 1;\
+    _p[17] = (_rw>>24) & 1;\
+    _p[18] = (_rw>>25) & 1;\
+    _p[19] = (_rw>>26) & 1;\
+    _p[20] = (_rw>>27) & 1; _p += 21; \
+    break; \
+  case 2: \
+    _p[0] = (_rw) & 1; \
+    _p[1] = (_rw>>1) & 1;\
+    _p[2] = (_rw>>2) & 1;\
+    _p[3] = (_rw>>3) & 1;\
+    _p[4] = (_rw>>4) & 1;\
+    _p[5] = (_rw>>5) & 1;\
+    _p[6] = (_rw>>6) & 1;\
+    _p[7] = (_rw>>7) & 3;\
+    _p[8] = (_rw>>9) & 3;\
+    _p[9] = (_rw>>11) & 3;\
+    _p[10] = (_rw>>13) & 3;\
+    _p[11] = (_rw>>15) & 3;\
+    _p[12] = 
(_rw>>17) & 3;\ + _p[13] = (_rw>>19) & 3;\ + _p[14] = (_rw>>21) & 1;\ + _p[15] = (_rw>>22) & 1;\ + _p[16] = (_rw>>23) & 1;\ + _p[17] = (_rw>>24) & 1;\ + _p[18] = (_rw>>25) & 1;\ + _p[19] = (_rw>>26) & 1;\ + _p[20] = (_rw>>27) & 1; _p += 21;\ + break; \ + case 3: \ + _p[0] = (_rw) & 1; \ + _p[1] = (_rw>>1) & 1;\ + _p[2] = (_rw>>2) & 1;\ + _p[3] = (_rw>>3) & 1;\ + _p[4] = (_rw>>4) & 1;\ + _p[5] = (_rw>>5) & 1;\ + _p[6] = (_rw>>6) & 1;\ + _p[7] = (_rw>>7) & 1;\ + _p[8] = (_rw>>8) & 1;\ + _p[9] = (_rw>>9) & 1;\ + _p[10] = (_rw>>10) & 1;\ + _p[11] = (_rw>>11) & 1;\ + _p[12] = (_rw>>12) & 1;\ + _p[13] = (_rw>>13) & 1;\ + _p[14] = (_rw>>14) & 3;\ + _p[15] = (_rw>>16) & 3;\ + _p[16] = (_rw>>18) & 3;\ + _p[17] = (_rw>>20) & 3;\ + _p[18] = (_rw>>22) & 3;\ + _p[19] = (_rw>>24) & 3;\ + _p[20] = (_rw>>26) & 3; _p += 21;\ + break; \ + case 4: \ + _p[ 0] = (_rw ) & 3;\ + _p[ 1] = (_rw>> 2) & 3;\ + _p[ 2] = (_rw>> 4) & 3;\ + _p[ 3] = (_rw>> 6) & 3;\ + _p[ 4] = (_rw>> 8) & 3;\ + _p[ 5] = (_rw>>10) & 3;\ + _p[ 6] = (_rw>>12) & 3;\ + _p[ 7] = (_rw>>14) & 3;\ + _p[ 8] = (_rw>>16) & 3;\ + _p[ 9] = (_rw>>18) & 3;\ + _p[10] = (_rw>>20) & 3;\ + _p[11] = (_rw>>22) & 3;\ + _p[12] = (_rw>>24) & 3;\ + _p[13] = (_rw>>26) & 3; _p += 14;\ + break; \ + case 5: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 7;\ + _p[2] = (_rw>>7) & 7;\ + _p[3] = (_rw>>10) & 7;\ + _p[4] = (_rw>>13) & 7;\ + _p[5] = (_rw>>16) & 7;\ + _p[6] = (_rw>>19) & 7;\ + _p[7] = (_rw>>22) & 7;\ + _p[8] = (_rw>>25) & 7; _p += 9;\ + break; \ + case 6: \ + _p[0] = (_rw) & 7; \ + _p[1] = (_rw>>3) & 15;\ + _p[2] = (_rw>>7) & 15;\ + _p[3] = (_rw>>11) & 15;\ + _p[4] = (_rw>>15) & 15;\ + _p[5] = (_rw>>19) & 7;\ + _p[6] = (_rw>>22) & 7;\ + _p[7] = (_rw>>25) & 7; _p += 8;\ + break; \ + case 7: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 15;\ + _p[3] = (_rw>>12) & 15;\ + _p[4] = (_rw>>16) & 15;\ + _p[5] = (_rw>>20) & 15;\ + _p[6] = (_rw>>24) & 15; _p += 7;\ + break; \ + case 8: \ + _p[0] = (_rw ) & 31;\ + _p[1] = (_rw>> 5) & 31;\ + _p[2] = (_rw>>10) & 31;\ + _p[3] = (_rw>>15) & 31;\ + _p[4] = (_rw>>20) & 15;\ + _p[5] = (_rw>>24) & 15; _p += 6;\ + break; \ + case 9: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 31;\ + _p[3] = (_rw>>13) & 31;\ + _p[4] = (_rw>>18) & 31;\ + _p[5] = (_rw>>23) & 31; _p += 6;\ + break; \ + case 10: \ + _p[0] = (_rw) & 63; \ + _p[1] = (_rw>>6) & 63;\ + _p[2] = (_rw>>12) & 63;\ + _p[3] = (_rw>>18) & 31;\ + _p[4] = (_rw>>23) & 31; _p += 5;\ + break; \ + case 11: \ + _p[0] = (_rw) & 31; \ + _p[1] = (_rw>>5) & 31;\ + _p[2] = (_rw>>10) & 63;\ + _p[3] = (_rw>>16) & 63;\ + _p[4] = (_rw>>22) & 63; _p += 5;\ + break; \ + case 12: \ + _p[0] = (_rw) & 127; \ + _p[1] = (_rw>>7) & 127;\ + _p[2] = (_rw>>14) & 127;\ + _p[3] = (_rw>>21) & 127; _p += 4;\ + break; \ + case 13: \ + _p[0] = (_rw) & 1023; \ + _p[1] = (_rw>>10) & 511;\ + _p[2] = (_rw>>19) & 511; _p += 3;\ + break; \ + case 14: \ + _p[0] = (_rw) & 16383; \ + _p[1] = (_rw>>14) & 16383; _p += 2;\ + break; \ + case 15: \ + _p[0] = (_rw) & ((1<<28)-1); _p++; \ + break; \ + } \ +} + +#if 0 +#define BREAK _rw = *_in++; goto *_lab[__out<_oute?((_rw)>>28):16] + +#define s16dec(__in, __n, __pout) ({\ + __label__ _lab0,_lab1,_lab2,_lab3,_lab4,_lab5,_lab6,_lab7,_lab8,_lab9,_lab10,_lab11,_lab12,_lab13,_lab14,_lab15,_labend;\ + static void *_lab[] = { &&_lab0, &&_lab1, &&_lab2, &&_lab3, &&_lab4, &&_lab5, &&_lab6, &&_lab7, &&_lab8, &&_lab9, &&_lab10, &&_lab11, &&_lab12, &&_lab13, &&_lab14, &&_lab15, &&_labend };\ + unsigned *_in = __in; 
typeof(__pout[0]) *__out = __pout, *_oute = __out+(__n); register unsigned _rw = *_in++; goto *_lab[(_rw)>>28];\ + _lab0:\ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 1; \ + __out[15] = (_rw>>15) & 1; \ + __out[16] = (_rw>>16) & 1; \ + __out[17] = (_rw>>17) & 1; \ + __out[18] = (_rw>>18) & 1; \ + __out[19] = (_rw>>19) & 1; \ + __out[20] = (_rw>>20) & 1; \ + __out[21] = (_rw>>21) & 1; \ + __out[22] = (_rw>>22) & 1; \ + __out[23] = (_rw>>23) & 1; \ + __out[24] = (_rw>>24) & 1; \ + __out[25] = (_rw>>25) & 1; \ + __out[26] = (_rw>>26) & 1; \ + __out[27] = (_rw>>27) & 1; __out += 28;\ + BREAK; \ + _lab1: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 1; \ + __out[8] = (_rw>>15) & 1; \ + __out[9] = (_rw>>16) & 1; \ + __out[10] = (_rw>>17) & 1; \ + __out[11] = (_rw>>18) & 1; \ + __out[12] = (_rw>>19) & 1; \ + __out[13] = (_rw>>20) & 1; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ + __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21; \ + BREAK; \ + _lab2: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 3; \ + __out[8] = (_rw>>9) & 3; \ + __out[9] = (_rw>>11) & 3; \ + __out[10] = (_rw>>13) & 3; \ + __out[11] = (_rw>>15) & 3; \ + __out[12] = (_rw>>17) & 3; \ + __out[13] = (_rw>>19) & 3; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ + __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21;\ + BREAK; \ + _lab3: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 3; \ + __out[15] = (_rw>>16) & 3; \ + __out[16] = (_rw>>18) & 3; \ + __out[17] = (_rw>>20) & 3; \ + __out[18] = (_rw>>22) & 3; \ + __out[19] = (_rw>>24) & 3; \ + __out[20] = (_rw>>26) & 3; __out += 21;\ + BREAK; \ + _lab4: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 3; \ + __out[8] = (_rw>>16) & 3; \ + __out[9] = (_rw>>18) & 3; \ + __out[10] = (_rw>>20) & 3; \ + __out[11] = (_rw>>22) & 3; \ + __out[12] = (_rw>>24) & 3; \ + __out[13] = (_rw>>26) & 3; __out += 14;\ + BREAK; \ + _lab5: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 7; \ + __out[2] = (_rw>>7) & 7; \ + __out[3] = (_rw>>10) & 
7; \ + __out[4] = (_rw>>13) & 7; \ + __out[5] = (_rw>>16) & 7; \ + __out[6] = (_rw>>19) & 7; \ + __out[7] = (_rw>>22) & 7; \ + __out[8] = (_rw>>25) & 7; __out += 9;\ + BREAK; \ + _lab6: \ + __out[0] = (_rw) & 7; \ + __out[1] = (_rw>>3) & 15; \ + __out[2] = (_rw>>7) & 15; \ + __out[3] = (_rw>>11) & 15; \ + __out[4] = (_rw>>15) & 15; \ + __out[5] = (_rw>>19) & 7; \ + __out[6] = (_rw>>22) & 7; \ + __out[7] = (_rw>>25) & 7; __out += 8;\ + BREAK; \ + _lab7: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 15; \ + __out[3] = (_rw>>12) & 15; \ + __out[4] = (_rw>>16) & 15; \ + __out[5] = (_rw>>20) & 15; \ + __out[6] = (_rw>>24) & 15; __out += 7;\ + BREAK; \ + _lab8: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 31; \ + __out[3] = (_rw>>15) & 31; \ + __out[4] = (_rw>>20) & 15; \ + __out[5] = (_rw>>24) & 15; __out += 6;\ + BREAK; \ + _lab9: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 31; \ + __out[3] = (_rw>>13) & 31; \ + __out[4] = (_rw>>18) & 31; \ + __out[5] = (_rw>>23) & 31; __out += 6;\ + BREAK; \ + _lab10: \ + __out[0] = (_rw) & 63; \ + __out[1] = (_rw>>6) & 63; \ + __out[2] = (_rw>>12) & 63; \ + __out[3] = (_rw>>18) & 31; \ + __out[4] = (_rw>>23) & 31; __out += 5;\ + BREAK; \ + _lab11: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 63; \ + __out[3] = (_rw>>16) & 63; \ + __out[4] = (_rw>>22) & 63; __out += 5;\ + BREAK; \ + _lab12: \ + __out[0] = (_rw) & 127; \ + __out[1] = (_rw>>7) & 127; \ + __out[2] = (_rw>>14) & 127; \ + __out[3] = (_rw>>21) & 127; __out += 4;\ + BREAK; \ + _lab13: \ + __out[0] = (_rw) & 1023; \ + __out[1] = (_rw>>10) & 511; \ + __out[2] = (_rw>>19) & 511; __out += 3;\ + BREAK; \ + _lab14:\ + __out[0] = (_rw) & 16383; \ + __out[1] = (_rw>>14) & 16383; __out += 2;\ + BREAK; \ + _lab15:\ + __out[0] = (_rw) & ((1<<28)-1); __out++; \ + BREAK;\ + _labend:;(_in-1);\ +}) +#endif diff --git a/aux/vbyte_poly.h b/aux/vbyte_poly.h new file mode 100644 index 0000000..3c2668d --- /dev/null +++ b/aux/vbyte_poly.h @@ -0,0 +1,46 @@ +// +#define VBYTE_ENC(_v, _n) \ +{\ + unsigned _num; \ + unsigned char _barray[5]; \ + unsigned _i, _started = 0; \ + _num = _n; \ + for (_i = 0; _i < 5; _i++) \ + { \ + _barray[_i] = ((_num%128)<<1); \ + _num = _num/128; \ + } \ + for (_i = 4; _i > 0; _i--) \ + { \ + if ((_barray[_i] != 0) || (_started == 1)) \ + { \ + _started = 1; \ + *_v = _barray[_i]|0x1; \ + _v++; \ + } \ + } \ + *_v = _barray[0]|0x0; \ + _v++; \ +} + +#define VBYTE_DEC(_v, _n) \ +{\ + _n = ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if ((*_v&0x1)!= 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + }\ + }\ + }\ + _v++; \ +} + diff --git a/bitpack.c b/bitpack.c new file mode 100644 index 0000000..e364984 --- /dev/null +++ b/bitpack.c @@ -0,0 +1,34 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    bitpack.c - "Integer Compression" binary packing
+**/
+#include "bitpack_.h"
+#include "bitpack.h"
+#define IPPB( __ip,__x, __parm)
+
+#define PAD8(__x) ( (((__x)+8-1)/8) )
+
+unsigned char *bitpack32(unsigned *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
+unsigned char *bitpack16(unsigned short *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; }
+
diff --git a/bitpack.h b/bitpack.h
new file mode 100644
index 0000000..77dee67
--- /dev/null
+++ b/bitpack.h
@@ -0,0 +1,30 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    bitpack.h - "Integer Compression" binary packing
+**/
+
+unsigned char *bitpack16( unsigned short *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out);
+unsigned char *bitpack32( unsigned       *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out);
+
+
diff --git a/bitpack64_.h b/bitpack64_.h
new file mode 100644
index 0000000..d74b27c
--- /dev/null
+++ b/bitpack64_.h
@@ -0,0 +1,1136 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + bitpack64_.h - "Integer Compression" binary packing +**/ + +#define BITBLK32_1(ip, i, op, parm) { ; register uint32_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 1;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 2;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 3;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 4;\ + IPPB(ip, i*32+ 5, parm); w |= (uint32_t)SRC(ip, i*32+ 5) << 5;\ + IPPB(ip, i*32+ 6, parm); w |= (uint32_t)SRC(ip, i*32+ 6) << 6;\ + IPPB(ip, i*32+ 7, parm); w |= (uint32_t)SRC(ip, i*32+ 7) << 7;\ + IPPB(ip, i*32+ 8, parm); w |= (uint32_t)SRC(ip, i*32+ 8) << 8;\ + IPPB(ip, i*32+ 9, parm); w |= (uint32_t)SRC(ip, i*32+ 9) << 9;\ + IPPB(ip, i*32+10, parm); w |= (uint32_t)SRC(ip, i*32+10) << 10;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 11;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 12;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 13;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 14;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 15;\ + IPPB(ip, i*32+16, parm); w |= (uint32_t)SRC(ip, i*32+16) << 16;\ + IPPB(ip, i*32+17, parm); w |= (uint32_t)SRC(ip, i*32+17) << 17;\ + IPPB(ip, i*32+18, parm); w |= (uint32_t)SRC(ip, i*32+18) << 18;\ + IPPB(ip, i*32+19, parm); w |= (uint32_t)SRC(ip, i*32+19) << 19;\ + IPPB(ip, i*32+20, parm); w |= (uint32_t)SRC(ip, i*32+20) << 20;\ + IPPB(ip, i*32+21, parm); w |= (uint32_t)SRC(ip, i*32+21) << 21;\ + IPPB(ip, i*32+22, parm); w |= (uint32_t)SRC(ip, i*32+22) << 22;\ + IPPB(ip, i*32+23, parm); w |= (uint32_t)SRC(ip, i*32+23) << 23;\ + IPPB(ip, i*32+24, parm); w |= (uint32_t)SRC(ip, i*32+24) << 24;\ + IPPB(ip, i*32+25, parm); w |= (uint32_t)SRC(ip, i*32+25) << 25;\ + IPPB(ip, i*32+26, parm); w |= (uint32_t)SRC(ip, i*32+26) << 26;\ + IPPB(ip, i*32+27, parm); w |= (uint32_t)SRC(ip, i*32+27) << 27;\ + IPPB(ip, i*32+28, parm); w |= (uint32_t)SRC(ip, i*32+28) << 28;\ + IPPB(ip, i*32+29, parm); w |= (uint32_t)SRC(ip, i*32+29) << 29;\ + IPPB(ip, i*32+30, parm); w |= (uint32_t)SRC(ip, i*32+30) << 30;\ + IPPB(ip, i*32+31, parm); w |= (uint32_t)SRC(ip, i*32+31) << 31;*((uint32_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_1(ip, op, parm) { \ + BITBLK32_1(ip, 0, op, parm); SRCI(ip); op += 1*4/sizeof(op[0]);\ +} + +#define BITBLK64_2(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 2;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 4;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 6;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 8;\ + IPPB(ip, i*32+ 5, parm); w |= (uint32_t)SRC(ip, i*32+ 5) << 10;\ + IPPB(ip, i*32+ 6, parm); w |= (uint32_t)SRC(ip, i*32+ 6) << 12;\ + IPPB(ip, i*32+ 7, parm); w |= (uint32_t)SRC(ip, i*32+ 7) << 14;\ + IPPB(ip, i*32+ 8, parm); w |= (uint32_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); w |= (uint32_t)SRC(ip, i*32+ 9) << 18;\ + IPPB(ip, i*32+10, parm); w |= (uint32_t)SRC(ip, i*32+10) << 20;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 22;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 24;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, 
i*32+13) << 26;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 28;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 30;\ + IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 34;\ + IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 36;\ + IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 38;\ + IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, i*32+20) << 40;\ + IPPB(ip, i*32+21, parm); w |= (uint64_t)SRC(ip, i*32+21) << 42;\ + IPPB(ip, i*32+22, parm); w |= (uint64_t)SRC(ip, i*32+22) << 44;\ + IPPB(ip, i*32+23, parm); w |= (uint64_t)SRC(ip, i*32+23) << 46;\ + IPPB(ip, i*32+24, parm); w |= (uint64_t)SRC(ip, i*32+24) << 48;\ + IPPB(ip, i*32+25, parm); w |= (uint64_t)SRC(ip, i*32+25) << 50;\ + IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 52;\ + IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 54;\ + IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 56;\ + IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 58;\ + IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 60;\ + IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 62;*((uint64_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_2(ip, op, parm) { \ + BITBLK64_2(ip, 0, op, parm); SRCI(ip); op += 2*4/sizeof(op[0]);\ +} + +#define BITBLK64_3(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 3;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 6;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 9;\ + IPPB(ip, i*64+ 4, parm); w |= (uint32_t)SRC(ip, i*64+ 4) << 12;\ + IPPB(ip, i*64+ 5, parm); w |= (uint32_t)SRC(ip, i*64+ 5) << 15;\ + IPPB(ip, i*64+ 6, parm); w |= (uint32_t)SRC(ip, i*64+ 6) << 18;\ + IPPB(ip, i*64+ 7, parm); w |= (uint32_t)SRC(ip, i*64+ 7) << 21;\ + IPPB(ip, i*64+ 8, parm); w |= (uint32_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); w |= (uint32_t)SRC(ip, i*64+ 9) << 27;\ + IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 30;\ + IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 33;\ + IPPB(ip, i*64+12, parm); w |= (uint64_t)SRC(ip, i*64+12) << 36;\ + IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 39;\ + IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 42;\ + IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 45;\ + IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ + IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 51;\ + IPPB(ip, i*64+18, parm); w |= (uint64_t)SRC(ip, i*64+18) << 54;\ + IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 57;\ + IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 60 | (uint64_t)SRC1(ip, i*64+21) << 63;*((uint64_t *)op+i*3+ 0) = w;\ + IPPB(ip, i*64+21, parm); w = (uint32_t)SRC(ip, i*64+21) >> 1;\ + IPPB(ip, i*64+22, parm); w |= (uint32_t)SRC(ip, i*64+22) << 2;\ + IPPB(ip, i*64+23, parm); w |= (uint32_t)SRC(ip, i*64+23) << 5;\ + IPPB(ip, i*64+24, parm); w |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); w |= (uint32_t)SRC(ip, i*64+25) << 11;\ + IPPB(ip, i*64+26, parm); w |= (uint32_t)SRC(ip, i*64+26) << 14;\ + IPPB(ip, i*64+27, parm); w |= (uint32_t)SRC(ip, i*64+27) << 17;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 20;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 23;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) 
<< 26;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 29;*((uint64_t *)op+i*3+ 1) = w;;\ +} + +#define BITPACK64_3(ip, op, parm) { \ + BITBLK64_3(ip, 0, op, parm); SRCI(ip); op += 3*4/sizeof(op[0]);\ +} + +#define BITBLK64_4(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*16+ 0, parm); w = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); w |= (uint32_t)SRC(ip, i*16+ 1) << 4;\ + IPPB(ip, i*16+ 2, parm); w |= (uint32_t)SRC(ip, i*16+ 2) << 8;\ + IPPB(ip, i*16+ 3, parm); w |= (uint32_t)SRC(ip, i*16+ 3) << 12;\ + IPPB(ip, i*16+ 4, parm); w |= (uint32_t)SRC(ip, i*16+ 4) << 16;\ + IPPB(ip, i*16+ 5, parm); w |= (uint32_t)SRC(ip, i*16+ 5) << 20;\ + IPPB(ip, i*16+ 6, parm); w |= (uint32_t)SRC(ip, i*16+ 6) << 24;\ + IPPB(ip, i*16+ 7, parm); w |= (uint32_t)SRC(ip, i*16+ 7) << 28;\ + IPPB(ip, i*16+ 8, parm); w |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ + IPPB(ip, i*16+ 9, parm); w |= (uint64_t)SRC(ip, i*16+ 9) << 36;\ + IPPB(ip, i*16+10, parm); w |= (uint64_t)SRC(ip, i*16+10) << 40;\ + IPPB(ip, i*16+11, parm); w |= (uint64_t)SRC(ip, i*16+11) << 44;\ + IPPB(ip, i*16+12, parm); w |= (uint64_t)SRC(ip, i*16+12) << 48;\ + IPPB(ip, i*16+13, parm); w |= (uint64_t)SRC(ip, i*16+13) << 52;\ + IPPB(ip, i*16+14, parm); w |= (uint64_t)SRC(ip, i*16+14) << 56;\ + IPPB(ip, i*16+15, parm); w |= (uint64_t)SRC(ip, i*16+15) << 60;*((uint64_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_4(ip, op, parm) { \ + BITBLK64_4(ip, 0, op, parm);\ + BITBLK64_4(ip, 1, op, parm); SRCI(ip); op += 4*4/sizeof(op[0]);\ +} + +#define BITBLK64_5(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 5;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 10;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 15;\ + IPPB(ip, i*64+ 4, parm); w |= (uint32_t)SRC(ip, i*64+ 4) << 20;\ + IPPB(ip, i*64+ 5, parm); w |= (uint32_t)SRC(ip, i*64+ 5) << 25;\ + IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 30;\ + IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 35;\ + IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 40;\ + IPPB(ip, i*64+ 9, parm); w |= (uint64_t)SRC(ip, i*64+ 9) << 45;\ + IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 50;\ + IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 55 | (uint64_t)SRC1(ip, i*64+12) << 60;*((uint64_t *)op+i*5+ 0) = w;\ + IPPB(ip, i*64+12, parm); w = (uint32_t)SRC(ip, i*64+12) >> 4;\ + IPPB(ip, i*64+13, parm); w |= (uint32_t)SRC(ip, i*64+13) << 1;\ + IPPB(ip, i*64+14, parm); w |= (uint32_t)SRC(ip, i*64+14) << 6;\ + IPPB(ip, i*64+15, parm); w |= (uint32_t)SRC(ip, i*64+15) << 11;\ + IPPB(ip, i*64+16, parm); w |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); w |= (uint32_t)SRC(ip, i*64+17) << 21;\ + IPPB(ip, i*64+18, parm); w |= (uint32_t)SRC(ip, i*64+18) << 26;\ + IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 31;\ + IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 36;\ + IPPB(ip, i*64+21, parm); w |= (uint64_t)SRC(ip, i*64+21) << 41;\ + IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 46;\ + IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 51;\ + IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 56 | (uint64_t)SRC1(ip, i*64+25) << 61;*((uint64_t *)op+i*5+ 1) = w;\ + IPPB(ip, i*64+25, parm); w = (uint32_t)SRC(ip, i*64+25) >> 3;\ + IPPB(ip, i*64+26, parm); w |= (uint32_t)SRC(ip, i*64+26) << 2;\ + IPPB(ip, i*64+27, parm); w |= 
(uint32_t)SRC(ip, i*64+27) << 7;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 12;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 17;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) << 22;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 27;*((uint64_t *)op+i*5+ 2) = w;;\ +} + +#define BITPACK64_5(ip, op, parm) { \ + BITBLK64_5(ip, 0, op, parm); SRCI(ip); op += 5*4/sizeof(op[0]);\ +} + +#define BITBLK64_6(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 6;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 12;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 18;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 24;\ + IPPB(ip, i*32+ 5, parm); w |= (uint64_t)SRC(ip, i*32+ 5) << 30;\ + IPPB(ip, i*32+ 6, parm); w |= (uint64_t)SRC(ip, i*32+ 6) << 36;\ + IPPB(ip, i*32+ 7, parm); w |= (uint64_t)SRC(ip, i*32+ 7) << 42;\ + IPPB(ip, i*32+ 8, parm); w |= (uint64_t)SRC(ip, i*32+ 8) << 48;\ + IPPB(ip, i*32+ 9, parm); w |= (uint64_t)SRC(ip, i*32+ 9) << 54 | (uint64_t)SRC1(ip, i*32+10) << 60;*((uint64_t *)op+i*3+ 0) = w;\ + IPPB(ip, i*32+10, parm); w = (uint32_t)SRC(ip, i*32+10) >> 4;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 2;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 8;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 14;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 20;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 26;\ + IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 38;\ + IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 44;\ + IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 50;\ + IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, i*32+20) << 56 | (uint64_t)SRC1(ip, i*32+21) << 62;*((uint64_t *)op+i*3+ 1) = w;\ + IPPB(ip, i*32+21, parm); w = (uint32_t)SRC(ip, i*32+21) >> 2;\ + IPPB(ip, i*32+22, parm); w |= (uint32_t)SRC(ip, i*32+22) << 4;\ + IPPB(ip, i*32+23, parm); w |= (uint32_t)SRC(ip, i*32+23) << 10;\ + IPPB(ip, i*32+24, parm); w |= (uint32_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); w |= (uint32_t)SRC(ip, i*32+25) << 22;\ + IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 28;\ + IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 34;\ + IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 40;\ + IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 46;\ + IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 52;\ + IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 58;*((uint64_t *)op+i*3+ 2) = w;;\ +} + +#define BITPACK64_6(ip, op, parm) { \ + BITBLK64_6(ip, 0, op, parm); SRCI(ip); op += 6*4/sizeof(op[0]);\ +} + +#define BITBLK64_7(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 7;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 14;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 21;\ + IPPB(ip, i*64+ 4, parm); w |= (uint64_t)SRC(ip, i*64+ 4) << 28;\ + IPPB(ip, i*64+ 5, parm); w |= (uint64_t)SRC(ip, i*64+ 5) << 35;\ + IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 42;\ + IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 49;\ + IPPB(ip, i*64+ 8, parm); w |= 
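+ /* 7-bit straddle: element 8 fills bits 56..62 and SRC1, which appears to\
+    fetch element 9 ahead of its own IPPB step, tops the word with its low\
+    bit at position 63; the 6 high bits (>> 1) then open the next word */\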
(uint64_t)SRC(ip, i*64+ 8) << 56 | (uint64_t)SRC1(ip, i*64+9) << 63;*((uint64_t *)op+i*7+ 0) = w;\ + IPPB(ip, i*64+ 9, parm); w = (uint32_t)SRC(ip, i*64+ 9) >> 1;\ + IPPB(ip, i*64+10, parm); w |= (uint32_t)SRC(ip, i*64+10) << 6;\ + IPPB(ip, i*64+11, parm); w |= (uint32_t)SRC(ip, i*64+11) << 13;\ + IPPB(ip, i*64+12, parm); w |= (uint32_t)SRC(ip, i*64+12) << 20;\ + IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 27;\ + IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 34;\ + IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 41;\ + IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ + IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 55 | (uint64_t)SRC1(ip, i*64+18) << 62;*((uint64_t *)op+i*7+ 1) = w;\ + IPPB(ip, i*64+18, parm); w = (uint32_t)SRC(ip, i*64+18) >> 2;\ + IPPB(ip, i*64+19, parm); w |= (uint32_t)SRC(ip, i*64+19) << 5;\ + IPPB(ip, i*64+20, parm); w |= (uint32_t)SRC(ip, i*64+20) << 12;\ + IPPB(ip, i*64+21, parm); w |= (uint32_t)SRC(ip, i*64+21) << 19;\ + IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 26;\ + IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 33;\ + IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 40;\ + IPPB(ip, i*64+25, parm); w |= (uint64_t)SRC(ip, i*64+25) << 47;\ + IPPB(ip, i*64+26, parm); w |= (uint64_t)SRC(ip, i*64+26) << 54 | (uint64_t)SRC1(ip, i*64+27) << 61;*((uint64_t *)op+i*7+ 2) = w;\ + IPPB(ip, i*64+27, parm); w = (uint32_t)SRC(ip, i*64+27) >> 3;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 4;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 11;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) << 18;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 25;*((uint64_t *)op+i*7+ 3) = w;;\ +} + +#define BITPACK64_7(ip, op, parm) { \ + BITBLK64_7(ip, 0, op, parm); SRCI(ip); op += 7*4/sizeof(op[0]);\ +} + +#define BITBLK64_8(ip, i, op, parm) { ;\ + IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*1+ 0) = (uint32_t)SRC(ip, i*8+ 0) ;\ + IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 1) << 8;\ + IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 2) << 16;\ + IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 3) << 24;\ + IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 4) << 32;\ + IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 5) << 40;\ + IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 6) << 48;\ + IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 7) << 56;\ +} + +#define BITPACK64_8(ip, op, parm) { \ + BITBLK64_8(ip, 0, op, parm);\ + BITBLK64_8(ip, 1, op, parm);\ + BITBLK64_8(ip, 2, op, parm);\ + BITBLK64_8(ip, 3, op, parm); SRCI(ip); op += 8*4/sizeof(op[0]);\ +} + +#define BITBLK64_9(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 9;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint32_t)SRC(ip, i*64+ 2) << 18;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 27;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 36;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 5) << 45;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 6) << 54 | (uint64_t)SRC1(ip, i*64+7) << 
63;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*9+ 1) = (uint32_t)SRC(ip, i*64+ 7) >> 1;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*9+ 1) |= (uint32_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*9+ 1) |= (uint32_t)SRC(ip, i*64+ 9) << 17;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+10) << 26;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+11) << 35;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+12) << 44;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+13) << 53 | (uint64_t)SRC1(ip, i*64+14) << 62;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*9+ 2) = (uint32_t)SRC(ip, i*64+14) >> 2;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*9+ 2) |= (uint32_t)SRC(ip, i*64+15) << 7;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*9+ 2) |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+17) << 25;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+18) << 34;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+19) << 43;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+20) << 52 | (uint64_t)SRC1(ip, i*64+21) << 61;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*9+ 3) = (uint32_t)SRC(ip, i*64+21) >> 3;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*64+22) << 6;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*64+23) << 15;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+24) << 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+25) << 33;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+26) << 42;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+27) << 51 | (uint64_t)SRC1(ip, i*64+28) << 60;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*9+ 4) = (uint32_t)SRC(ip, i*64+28) >> 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+29) << 5;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+30) << 14;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+31) << 23;\ +} + +#define BITPACK64_9(ip, op, parm) { \ + BITBLK64_9(ip, 0, op, parm); SRCI(ip); op += 9*4/sizeof(op[0]);\ +} + +#define BITBLK64_10(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint32_t)SRC(ip, i*32+ 1) << 10;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint32_t)SRC(ip, i*32+ 2) << 20;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 3) << 30;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 4) << 40;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 5) << 50 | (uint64_t)SRC1(ip, i*32+6) << 60;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*5+ 1) = (uint32_t)SRC(ip, i*32+ 6) >> 4;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*5+ 1) |= (uint32_t)SRC(ip, i*32+ 7) << 6;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*5+ 1) |= (uint32_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+ 9) << 26;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+10) << 36;\ + IPPB(ip, i*32+11, parm); 
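+ /* 10-bit straddle: element 11 lands at bits 46..55 of word 1, SRC1 adds\
+    the low 8 bits of element 12 at bit 56, and its 2 high bits (>> 8)\
+    open word 2 */\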
*((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+11) << 46 | (uint64_t)SRC1(ip, i*32+12) << 56;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*5+ 2) = (uint32_t)SRC(ip, i*32+12) >> 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+13) << 2;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+14) << 12;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+15) << 22;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+17) << 42;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+18) << 52 | (uint64_t)SRC1(ip, i*32+19) << 62;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*5+ 3) = (uint32_t)SRC(ip, i*32+19) >> 2;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*32+20) << 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*32+21) << 18;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+22) << 28;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+23) << 38;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+24) << 48 | (uint64_t)SRC1(ip, i*32+25) << 58;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*5+ 4) = (uint32_t)SRC(ip, i*32+25) >> 6;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*32+26) << 4;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*32+27) << 14;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+28) << 24;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+29) << 34;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+30) << 44;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+31) << 54;\ +} + +#define BITPACK64_10(ip, op, parm) { \ + BITBLK64_10(ip, 0, op, parm); SRCI(ip); op += 10*4/sizeof(op[0]);\ +} + +#define BITBLK64_11(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 11;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 22;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 33;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 44 | (uint64_t)SRC1(ip, i*64+5) << 55;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*11+ 1) = (uint32_t)SRC(ip, i*64+ 5) >> 9;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*64+ 6) << 2;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*64+ 7) << 13;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 9) << 35;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+10) << 46 | (uint64_t)SRC1(ip, i*64+11) << 57;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*11+ 2) = (uint32_t)SRC(ip, i*64+11) >> 7;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*64+12) << 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*64+13) << 15;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+14) << 26;\ + 
IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+15) << 37;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 59;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*11+ 3) = (uint32_t)SRC(ip, i*64+17) >> 5;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*64+18) << 6;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*64+19) << 17;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+20) << 28;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+21) << 39;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+22) << 50 | (uint64_t)SRC1(ip, i*64+23) << 61;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*11+ 4) = (uint32_t)SRC(ip, i*64+23) >> 3;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*64+25) << 19;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+26) << 30;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+27) << 41;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+28) << 52 | (uint64_t)SRC1(ip, i*64+29) << 63;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*11+ 5) = (uint32_t)SRC(ip, i*64+29) >> 1;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*64+30) << 10;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*64+31) << 21;\ +} + +#define BITPACK64_11(ip, op, parm) { \ + BITBLK64_11(ip, 0, op, parm); SRCI(ip); op += 11*4/sizeof(op[0]);\ +} + +#define BITBLK64_12(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint32_t)SRC(ip, i*16+ 1) << 12;\ + IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 24;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 3) << 36;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 4) << 48 | (uint64_t)SRC1(ip, i*16+5) << 60;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*3+ 1) = (uint32_t)SRC(ip, i*16+ 5) >> 4;\ + IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*16+ 6) << 8;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*16+ 7) << 20;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 9) << 44 | (uint64_t)SRC1(ip, i*16+10) << 56;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*3+ 2) = (uint32_t)SRC(ip, i*16+10) >> 8;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*3+ 2) |= (uint32_t)SRC(ip, i*16+11) << 4;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*3+ 2) |= (uint32_t)SRC(ip, i*16+12) << 16;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+13) << 28;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+14) << 40;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+15) << 52;\ +} + +#define BITPACK64_12(ip, op, parm) { \ + BITBLK64_12(ip, 0, op, parm);\ + BITBLK64_12(ip, 1, op, parm); SRCI(ip); op += 12*4/sizeof(op[0]);\ +} + +#define BITBLK64_13(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*13+ 0) 
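+ /* 13-bit block: 32 elements * 13 bits = 416 bits = 52 bytes, i.e. six\
+    full 64-bit words plus half of word 6, matching the op += 13*4 advance\
+    in BITPACK64_13 */\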
= (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 13;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 26;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 39 | (uint64_t)SRC1(ip, i*64+4) << 52;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*13+ 1) = (uint32_t)SRC(ip, i*64+ 4) >> 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*13+ 1) |= (uint32_t)SRC(ip, i*64+ 5) << 1;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*13+ 1) |= (uint32_t)SRC(ip, i*64+ 6) << 14;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 27;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 53;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*13+ 2) = (uint32_t)SRC(ip, i*64+ 9) >> 11;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*64+10) << 2;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*64+11) << 15;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+12) << 28;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+13) << 41 | (uint64_t)SRC1(ip, i*64+14) << 54;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*13+ 3) = (uint32_t)SRC(ip, i*64+14) >> 10;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*13+ 3) |= (uint32_t)SRC(ip, i*64+15) << 3;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*13+ 3) |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+17) << 29;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+18) << 42 | (uint64_t)SRC1(ip, i*64+19) << 55;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*13+ 4) = (uint32_t)SRC(ip, i*64+19) >> 9;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*64+20) << 4;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*64+21) << 17;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+22) << 30;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+23) << 43 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*13+ 5) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*13+ 5) |= (uint32_t)SRC(ip, i*64+25) << 5;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*13+ 5) |= (uint32_t)SRC(ip, i*64+26) << 18;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+27) << 31;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+28) << 44 | (uint64_t)SRC1(ip, i*64+29) << 57;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*13+ 6) = (uint32_t)SRC(ip, i*64+29) >> 7;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*64+30) << 6;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*64+31) << 19;\ +} + +#define BITPACK64_13(ip, op, parm) { \ + BITBLK64_13(ip, 0, op, parm); SRCI(ip); op += 13*4/sizeof(op[0]);\ +} + +#define BITBLK64_14(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint32_t)SRC(ip, i*32+ 1) << 14;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 28;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, 
i*32+ 3) << 42 | (uint64_t)SRC1(ip, i*32+4) << 56;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*7+ 1) = (uint32_t)SRC(ip, i*32+ 4) >> 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*7+ 1) |= (uint32_t)SRC(ip, i*32+ 5) << 6;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 20;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 7) << 34;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 8) << 48 | (uint64_t)SRC1(ip, i*32+9) << 62;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*7+ 2) = (uint32_t)SRC(ip, i*32+ 9) >> 2;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*7+ 2) |= (uint32_t)SRC(ip, i*32+10) << 12;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+11) << 26;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+12) << 40 | (uint64_t)SRC1(ip, i*32+13) << 54;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*7+ 3) = (uint32_t)SRC(ip, i*32+13) >> 10;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*32+14) << 4;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*32+15) << 18;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+17) << 46 | (uint64_t)SRC1(ip, i*32+18) << 60;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*7+ 4) = (uint32_t)SRC(ip, i*32+18) >> 4;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*7+ 4) |= (uint32_t)SRC(ip, i*32+19) << 10;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+20) << 24;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+21) << 38 | (uint64_t)SRC1(ip, i*32+22) << 52;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*7+ 5) = (uint32_t)SRC(ip, i*32+22) >> 12;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*7+ 5) |= (uint32_t)SRC(ip, i*32+23) << 2;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*7+ 5) |= (uint32_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+25) << 30;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+26) << 44 | (uint64_t)SRC1(ip, i*32+27) << 58;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*7+ 6) = (uint32_t)SRC(ip, i*32+27) >> 6;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*7+ 6) |= (uint32_t)SRC(ip, i*32+28) << 8;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+29) << 22;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+30) << 36;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+31) << 50;\ +} + +#define BITPACK64_14(ip, op, parm) { \ + BITBLK64_14(ip, 0, op, parm); SRCI(ip); op += 14*4/sizeof(op[0]);\ +} + +#define BITBLK64_15(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 15;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 30;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 45 | (uint64_t)SRC1(ip, i*64+4) << 60;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*15+ 1) = (uint32_t)SRC(ip, i*64+ 4) >> 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*15+ 1) |= (uint32_t)SRC(ip, i*64+ 5) << 11;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 6) 
<< 26;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 41 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*15+ 2) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*15+ 2) |= (uint32_t)SRC(ip, i*64+ 9) << 7;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+10) << 22;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+11) << 37 | (uint64_t)SRC1(ip, i*64+12) << 52;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*15+ 3) = (uint32_t)SRC(ip, i*64+12) >> 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*15+ 3) |= (uint32_t)SRC(ip, i*64+13) << 3;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+14) << 18;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+15) << 33;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 63;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*15+ 4) = (uint32_t)SRC(ip, i*64+17) >> 1;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*15+ 4) |= (uint32_t)SRC(ip, i*64+18) << 14;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+19) << 29;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+20) << 44 | (uint64_t)SRC1(ip, i*64+21) << 59;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*15+ 5) = (uint32_t)SRC(ip, i*64+21) >> 5;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*15+ 5) |= (uint32_t)SRC(ip, i*64+22) << 10;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+23) << 25;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 55;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*15+ 6) = (uint32_t)SRC(ip, i*64+25) >> 9;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*15+ 6) |= (uint32_t)SRC(ip, i*64+26) << 6;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+27) << 21;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+28) << 36 | (uint64_t)SRC1(ip, i*64+29) << 51;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*15+ 7) = (uint32_t)SRC(ip, i*64+29) >> 13;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*64+30) << 2;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*64+31) << 17;\ +} + +#define BITPACK64_15(ip, op, parm) { \ + BITBLK64_15(ip, 0, op, parm); SRCI(ip); op += 15*4/sizeof(op[0]);\ +} + +#define BITBLK64_16(ip, i, op, parm) { \ + IPPB(ip, i*4+ 0, parm); *(uint16_t *)(op+i*8+ 0) = SRC(ip, i*4+ 0);\ + IPPB(ip, i*4+ 1, parm); *(uint16_t *)(op+i*8+ 2) = SRC(ip, i*4+ 1);\ + IPPB(ip, i*4+ 2, parm); *(uint16_t *)(op+i*8+ 4) = SRC(ip, i*4+ 2);\ + IPPB(ip, i*4+ 3, parm); *(uint16_t *)(op+i*8+ 6) = SRC(ip, i*4+ 3);;\ +} + +#define BITPACK64_16(ip, op, parm) { \ + BITBLK64_16(ip, 0, op, parm);\ + BITBLK64_16(ip, 1, op, parm);\ + BITBLK64_16(ip, 2, op, parm);\ + BITBLK64_16(ip, 3, op, parm);\ + BITBLK64_16(ip, 4, op, parm);\ + BITBLK64_16(ip, 5, op, parm);\ + BITBLK64_16(ip, 6, op, parm);\ + BITBLK64_16(ip, 7, op, parm); SRCI(ip); op += 16*4/sizeof(op[0]);\ +} + +#define BITBLK64_17(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*17+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*17+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 17;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*17+ 0) |= 
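+ /* element 2 fills bits 34..50 of word 0; SRC1 tops it off with the low\
+    13 bits of element 3, whose 4 high bits (>> 13) then open word 1 */\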
(uint64_t)SRC(ip, i*64+ 2) << 34 | (uint64_t)SRC1(ip, i*64+3) << 51;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*17+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 13;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*17+ 1) |= (uint32_t)SRC(ip, i*64+ 4) << 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 21;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 38 | (uint64_t)SRC1(ip, i*64+7) << 55;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*17+ 2) = (uint32_t)SRC(ip, i*64+ 7) >> 9;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*17+ 2) |= (uint32_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 25;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+10) << 42 | (uint64_t)SRC1(ip, i*64+11) << 59;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*17+ 3) = (uint32_t)SRC(ip, i*64+11) >> 5;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*17+ 3) |= (uint32_t)SRC(ip, i*64+12) << 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+13) << 29;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+14) << 46 | (uint64_t)SRC1(ip, i*64+15) << 63;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*17+ 4) = (uint32_t)SRC(ip, i*64+15) >> 1;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+17) << 33 | (uint64_t)SRC1(ip, i*64+18) << 50;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*17+ 5) = (uint32_t)SRC(ip, i*64+18) >> 14;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*17+ 5) |= (uint32_t)SRC(ip, i*64+19) << 3;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+20) << 20;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+21) << 37 | (uint64_t)SRC1(ip, i*64+22) << 54;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*17+ 6) = (uint32_t)SRC(ip, i*64+22) >> 10;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*17+ 6) |= (uint32_t)SRC(ip, i*64+23) << 7;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+24) << 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+25) << 41 | (uint64_t)SRC1(ip, i*64+26) << 58;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*17+ 7) = (uint32_t)SRC(ip, i*64+26) >> 6;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*17+ 7) |= (uint32_t)SRC(ip, i*64+27) << 11;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+28) << 28;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+29) << 45 | (uint64_t)SRC1(ip, i*64+30) << 62;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*17+ 8) = (uint32_t)SRC(ip, i*64+30) >> 2;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*17+ 8) |= (uint32_t)SRC(ip, i*64+31) << 15;\ +} + +#define BITPACK64_17(ip, op, parm) { \ + BITBLK64_17(ip, 0, op, parm); SRCI(ip); op += 17*4/sizeof(op[0]);\ +} + +#define BITBLK64_18(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 18;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 36 | (uint64_t)SRC1(ip, i*32+3) << 54;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*9+ 1) = (uint32_t)SRC(ip, i*32+ 3) >> 10;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*9+ 1) |= 
(uint32_t)SRC(ip, i*32+ 4) << 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 5) << 26;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 44 | (uint64_t)SRC1(ip, i*32+7) << 62;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*9+ 2) = (uint32_t)SRC(ip, i*32+ 7) >> 2;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 9) << 34 | (uint64_t)SRC1(ip, i*32+10) << 52;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*9+ 3) = (uint32_t)SRC(ip, i*32+10) >> 12;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*32+11) << 6;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+12) << 24;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+13) << 42 | (uint64_t)SRC1(ip, i*32+14) << 60;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*9+ 4) = (uint32_t)SRC(ip, i*32+14) >> 4;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*32+15) << 14;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 50;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*9+ 5) = (uint32_t)SRC(ip, i*32+17) >> 14;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*9+ 5) |= (uint32_t)SRC(ip, i*32+18) << 4;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+19) << 22;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+20) << 40 | (uint64_t)SRC1(ip, i*32+21) << 58;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*9+ 6) = (uint32_t)SRC(ip, i*32+21) >> 6;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*9+ 6) |= (uint32_t)SRC(ip, i*32+22) << 12;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*9+ 6) |= (uint64_t)SRC(ip, i*32+23) << 30 | (uint64_t)SRC1(ip, i*32+24) << 48;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*9+ 7) = (uint32_t)SRC(ip, i*32+24) >> 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*9+ 7) |= (uint32_t)SRC(ip, i*32+25) << 2;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+26) << 20;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+27) << 38 | (uint64_t)SRC1(ip, i*32+28) << 56;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*9+ 8) = (uint32_t)SRC(ip, i*32+28) >> 8;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*9+ 8) |= (uint32_t)SRC(ip, i*32+29) << 10;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+30) << 28;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+31) << 46;\ +} + +#define BITPACK64_18(ip, op, parm) { \ + BITBLK64_18(ip, 0, op, parm); SRCI(ip); op += 18*4/sizeof(op[0]);\ +} + +#define BITBLK64_19(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*19+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 19;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 38 | (uint64_t)SRC1(ip, i*64+3) << 57;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*19+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 7;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*19+ 1) |= (uint32_t)SRC(ip, i*64+ 4) << 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*19+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 31 | (uint64_t)SRC1(ip, i*64+6) << 50;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*19+ 2) = (uint32_t)SRC(ip, i*64+ 6) 
>> 14;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*19+ 2) |= (uint32_t)SRC(ip, i*64+ 7) << 5;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 43 | (uint64_t)SRC1(ip, i*64+10) << 62;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*19+ 3) = (uint32_t)SRC(ip, i*64+10) >> 2;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+11) << 17;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+12) << 36 | (uint64_t)SRC1(ip, i*64+13) << 55;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*19+ 4) = (uint32_t)SRC(ip, i*64+13) >> 9;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*19+ 4) |= (uint32_t)SRC(ip, i*64+14) << 10;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*19+ 4) |= (uint64_t)SRC(ip, i*64+15) << 29 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*19+ 5) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*19+ 5) |= (uint32_t)SRC(ip, i*64+17) << 3;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+18) << 22;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+19) << 41 | (uint64_t)SRC1(ip, i*64+20) << 60;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*19+ 6) = (uint32_t)SRC(ip, i*64+20) >> 4;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+21) << 15;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+22) << 34 | (uint64_t)SRC1(ip, i*64+23) << 53;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*19+ 7) = (uint32_t)SRC(ip, i*64+23) >> 11;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*19+ 7) |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*19+ 7) |= (uint64_t)SRC(ip, i*64+25) << 27 | (uint64_t)SRC1(ip, i*64+26) << 46;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*19+ 8) = (uint32_t)SRC(ip, i*64+26) >> 18;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*19+ 8) |= (uint32_t)SRC(ip, i*64+27) << 1;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+28) << 20;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+29) << 39 | (uint64_t)SRC1(ip, i*64+30) << 58;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*19+ 9) = (uint32_t)SRC(ip, i*64+30) >> 6;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*19+ 9) |= (uint32_t)SRC(ip, i*64+31) << 13;\ +} + +#define BITPACK64_19(ip, op, parm) { \ + BITBLK64_19(ip, 0, op, parm); SRCI(ip); op += 19*4/sizeof(op[0]);\ +} + +#define BITBLK64_20(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 20;\ + IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 40 | (uint64_t)SRC1(ip, i*16+3) << 60;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*5+ 1) = (uint32_t)SRC(ip, i*16+ 3) >> 4;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 4) << 16;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 5) << 36 | (uint64_t)SRC1(ip, i*16+6) << 56;\ + IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*5+ 2) = (uint32_t)SRC(ip, i*16+ 6) >> 8;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*16+ 7) << 12;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | 
(uint64_t)SRC1(ip, i*16+9) << 52;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*5+ 3) = (uint32_t)SRC(ip, i*16+ 9) >> 12;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*16+10) << 8;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*16+11) << 28 | (uint64_t)SRC1(ip, i*16+12) << 48;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*5+ 4) = (uint32_t)SRC(ip, i*16+12) >> 16;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*16+13) << 4;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+14) << 24;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+15) << 44;\ +} + +#define BITPACK64_20(ip, op, parm) { \ + BITBLK64_20(ip, 0, op, parm);\ + BITBLK64_20(ip, 1, op, parm); SRCI(ip); op += 20*4/sizeof(op[0]);\ +} + +#define BITBLK64_21(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*21+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 21;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 42 | (uint64_t)SRC1(ip, i*64+3) << 63;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*21+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 1;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 20;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 41 | (uint64_t)SRC1(ip, i*64+6) << 62;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*21+ 2) = (uint32_t)SRC(ip, i*64+ 6) >> 2;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 19;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 61;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*21+ 3) = (uint32_t)SRC(ip, i*64+ 9) >> 3;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+10) << 18;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+11) << 39 | (uint64_t)SRC1(ip, i*64+12) << 60;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*21+ 4) = (uint32_t)SRC(ip, i*64+12) >> 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+13) << 17;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+14) << 38 | (uint64_t)SRC1(ip, i*64+15) << 59;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*21+ 5) = (uint32_t)SRC(ip, i*64+15) >> 5;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+17) << 37 | (uint64_t)SRC1(ip, i*64+18) << 58;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*21+ 6) = (uint32_t)SRC(ip, i*64+18) >> 6;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+19) << 15;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+20) << 36 | (uint64_t)SRC1(ip, i*64+21) << 57;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*21+ 7) = (uint32_t)SRC(ip, i*64+21) >> 7;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+22) << 14;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+23) << 35 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*21+ 8) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+25) << 13;\ + IPPB(ip, i*64+26, parm); 
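+ /* 21-bit straddle: element 26 occupies bits 34..54 of word 8, SRC1 adds\
+    the low 9 bits of element 27 at bit 55, and its 12 high bits (>> 9)\
+    start word 9 */\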
*((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+26) << 34 | (uint64_t)SRC1(ip, i*64+27) << 55;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*21+ 9) = (uint32_t)SRC(ip, i*64+27) >> 9;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+28) << 12;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+29) << 33 | (uint64_t)SRC1(ip, i*64+30) << 54;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*21+10) = (uint32_t)SRC(ip, i*64+30) >> 10;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*21+10) |= (uint32_t)SRC(ip, i*64+31) << 11;\ +} + +#define BITPACK64_21(ip, op, parm) { \ + BITBLK64_21(ip, 0, op, parm); SRCI(ip); op += 21*4/sizeof(op[0]);\ +} + +#define BITBLK64_22(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 22 | (uint64_t)SRC1(ip, i*32+2) << 44;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*11+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 20;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*32+ 3) << 2;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*32+ 4) << 24 | (uint64_t)SRC1(ip, i*32+5) << 46;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*11+ 2) = (uint32_t)SRC(ip, i*32+ 5) >> 18;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*32+ 6) << 4;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*32+ 7) << 26 | (uint64_t)SRC1(ip, i*32+8) << 48;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*11+ 3) = (uint32_t)SRC(ip, i*32+ 8) >> 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*32+ 9) << 6;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*32+10) << 28 | (uint64_t)SRC1(ip, i*32+11) << 50;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*11+ 4) = (uint32_t)SRC(ip, i*32+11) >> 14;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*32+12) << 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*32+13) << 30 | (uint64_t)SRC1(ip, i*32+14) << 52;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*11+ 5) = (uint32_t)SRC(ip, i*32+14) >> 12;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*32+15) << 10;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 54;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*11+ 6) = (uint32_t)SRC(ip, i*32+17) >> 10;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+18) << 12;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+19) << 34 | (uint64_t)SRC1(ip, i*32+20) << 56;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*11+ 7) = (uint32_t)SRC(ip, i*32+20) >> 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+21) << 14;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+22) << 36 | (uint64_t)SRC1(ip, i*32+23) << 58;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*11+ 8) = (uint32_t)SRC(ip, i*32+23) >> 6;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+25) << 38 | (uint64_t)SRC1(ip, i*32+26) << 60;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*11+ 9) = (uint32_t)SRC(ip, i*32+26) >> 4;\ + IPPB(ip, i*32+27, parm); *((uint64_t 
*)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+27) << 18;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+28) << 40 | (uint64_t)SRC1(ip, i*32+29) << 62;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*11+10) = (uint32_t)SRC(ip, i*32+29) >> 2;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+30) << 20;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+31) << 42;\ +} + +#define BITPACK64_22(ip, op, parm) { \ + BITBLK64_22(ip, 0, op, parm); SRCI(ip); op += 22*4/sizeof(op[0]);\ +} + +#define BITBLK64_23(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*23+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*23+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 23 | (uint64_t)SRC1(ip, i*64+2) << 46;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*23+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 18;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*23+ 1) |= (uint32_t)SRC(ip, i*64+ 3) << 5;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*23+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 28 | (uint64_t)SRC1(ip, i*64+5) << 51;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*23+ 2) = (uint32_t)SRC(ip, i*64+ 5) >> 13;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 10;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 33 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*23+ 3) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 15;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+10) << 38 | (uint64_t)SRC1(ip, i*64+11) << 61;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*23+ 4) = (uint32_t)SRC(ip, i*64+11) >> 3;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*23+ 4) |= (uint64_t)SRC(ip, i*64+12) << 20 | (uint64_t)SRC1(ip, i*64+13) << 43;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*23+ 5) = (uint32_t)SRC(ip, i*64+13) >> 21;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*23+ 5) |= (uint32_t)SRC(ip, i*64+14) << 2;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*23+ 5) |= (uint64_t)SRC(ip, i*64+15) << 25 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*23+ 6) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*23+ 6) |= (uint32_t)SRC(ip, i*64+17) << 7;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*23+ 6) |= (uint64_t)SRC(ip, i*64+18) << 30 | (uint64_t)SRC1(ip, i*64+19) << 53;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*23+ 7) = (uint32_t)SRC(ip, i*64+19) >> 11;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+20) << 12;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+21) << 35 | (uint64_t)SRC1(ip, i*64+22) << 58;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*23+ 8) = (uint32_t)SRC(ip, i*64+22) >> 6;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+23) << 17;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 63;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*23+ 9) = (uint32_t)SRC(ip, i*64+25) >> 1;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*23+ 9) |= (uint64_t)SRC(ip, i*64+26) << 22 | (uint64_t)SRC1(ip, i*64+27) << 45;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*23+10) = (uint32_t)SRC(ip, i*64+27) >> 19;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*23+10) |= 
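+ /* the 4 high bits of element 27 (>> 19) occupy bits 0..3 of word 10;\
+    element 28 continues at bits 4..26 */\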
(uint32_t)SRC(ip, i*64+28) << 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*23+10) |= (uint64_t)SRC(ip, i*64+29) << 27 | (uint64_t)SRC1(ip, i*64+30) << 50;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*23+11) = (uint32_t)SRC(ip, i*64+30) >> 14;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*23+11) |= (uint32_t)SRC(ip, i*64+31) << 9;\ +} + +#define BITPACK64_23(ip, op, parm) { \ + BITBLK64_23(ip, 0, op, parm); SRCI(ip); op += 23*4/sizeof(op[0]);\ +} + +#define BITBLK64_24(ip, i, op, parm) { ;\ + IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint32_t)SRC(ip, i*8+ 0) ;\ + IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*8+ 1) << 24 | (uint64_t)SRC1(ip, i*8+2) << 48;\ + IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*3+ 1) = (uint32_t)SRC(ip, i*8+ 2) >> 16;\ + IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*8+ 3) << 8;\ + IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*8+ 4) << 32 | (uint64_t)SRC1(ip, i*8+5) << 56;\ + IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*3+ 2) = (uint32_t)SRC(ip, i*8+ 5) >> 8;\ + IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 6) << 16;\ + IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 7) << 40;\ +} + +#define BITPACK64_24(ip, op, parm) { \ + BITBLK64_24(ip, 0, op, parm);\ + BITBLK64_24(ip, 1, op, parm);\ + BITBLK64_24(ip, 2, op, parm);\ + BITBLK64_24(ip, 3, op, parm); SRCI(ip); op += 24*4/sizeof(op[0]);\ +} + +#define BITBLK64_25(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*25+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*25+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 25 | (uint64_t)SRC1(ip, i*64+2) << 50;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*25+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 14;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 11;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 36 | (uint64_t)SRC1(ip, i*64+5) << 61;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*25+ 2) = (uint32_t)SRC(ip, i*64+ 5) >> 3;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*25+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 22 | (uint64_t)SRC1(ip, i*64+7) << 47;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*25+ 3) = (uint32_t)SRC(ip, i*64+ 7) >> 17;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 33 | (uint64_t)SRC1(ip, i*64+10) << 58;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*25+ 4) = (uint32_t)SRC(ip, i*64+10) >> 6;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*25+ 4) |= (uint64_t)SRC(ip, i*64+11) << 19 | (uint64_t)SRC1(ip, i*64+12) << 44;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*25+ 5) = (uint32_t)SRC(ip, i*64+12) >> 20;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*25+ 5) |= (uint32_t)SRC(ip, i*64+13) << 5;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*25+ 5) |= (uint64_t)SRC(ip, i*64+14) << 30 | (uint64_t)SRC1(ip, i*64+15) << 55;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*25+ 6) = (uint32_t)SRC(ip, i*64+15) >> 9;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*25+ 6) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 41;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*25+ 7) = (uint32_t)SRC(ip, i*64+17) >> 23;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*25+ 7) |= (uint32_t)SRC(ip, i*64+18) << 2;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*25+ 7) |= 
(uint64_t)SRC(ip, i*64+19) << 27 | (uint64_t)SRC1(ip, i*64+20) << 52;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*25+ 8) = (uint32_t)SRC(ip, i*64+20) >> 12;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+21) << 13;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+22) << 38 | (uint64_t)SRC1(ip, i*64+23) << 63;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*25+ 9) = (uint32_t)SRC(ip, i*64+23) >> 1;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*25+ 9) |= (uint64_t)SRC(ip, i*64+24) << 24 | (uint64_t)SRC1(ip, i*64+25) << 49;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*25+10) = (uint32_t)SRC(ip, i*64+25) >> 15;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+26) << 10;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+27) << 35 | (uint64_t)SRC1(ip, i*64+28) << 60;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*25+11) = (uint32_t)SRC(ip, i*64+28) >> 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*25+11) |= (uint64_t)SRC(ip, i*64+29) << 21 | (uint64_t)SRC1(ip, i*64+30) << 46;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*25+12) = (uint32_t)SRC(ip, i*64+30) >> 18;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*25+12) |= (uint32_t)SRC(ip, i*64+31) << 7;\ +} + +#define BITPACK64_25(ip, op, parm) { \ + BITBLK64_25(ip, 0, op, parm); SRCI(ip); op += 25*4/sizeof(op[0]);\ +} + +#define BITBLK64_26(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 26 | (uint64_t)SRC1(ip, i*32+2) << 52;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*13+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 12;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 14 | (uint64_t)SRC1(ip, i*32+4) << 40;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*13+ 2) = (uint32_t)SRC(ip, i*32+ 4) >> 24;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*32+ 5) << 2;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*32+ 6) << 28 | (uint64_t)SRC1(ip, i*32+7) << 54;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*13+ 3) = (uint32_t)SRC(ip, i*32+ 7) >> 10;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*32+ 8) << 16 | (uint64_t)SRC1(ip, i*32+9) << 42;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*13+ 4) = (uint32_t)SRC(ip, i*32+ 9) >> 22;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*32+10) << 4;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*32+11) << 30 | (uint64_t)SRC1(ip, i*32+12) << 56;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*13+ 5) = (uint32_t)SRC(ip, i*32+12) >> 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*32+13) << 18 | (uint64_t)SRC1(ip, i*32+14) << 44;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*13+ 6) = (uint32_t)SRC(ip, i*32+14) >> 20;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*32+15) << 6;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 58;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*13+ 7) = (uint32_t)SRC(ip, i*32+17) >> 6;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*13+ 7) |= (uint64_t)SRC(ip, i*32+18) << 20 | (uint64_t)SRC1(ip, i*32+19) << 46;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*13+ 8) = (uint32_t)SRC(ip, i*32+19) >> 18;\ + 
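+ /* word 8 of the 26-bit block: the 8 high bits of element 19 (>> 18) sit\
+    at bits 0..7, element 20 follows at bits 8..33 */\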
IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+20) << 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+21) << 34 | (uint64_t)SRC1(ip, i*32+22) << 60;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*13+ 9) = (uint32_t)SRC(ip, i*32+22) >> 4;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*13+ 9) |= (uint64_t)SRC(ip, i*32+23) << 22 | (uint64_t)SRC1(ip, i*32+24) << 48;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*13+10) = (uint32_t)SRC(ip, i*32+24) >> 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+25) << 10;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+26) << 36 | (uint64_t)SRC1(ip, i*32+27) << 62;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*13+11) = (uint32_t)SRC(ip, i*32+27) >> 2;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*13+11) |= (uint64_t)SRC(ip, i*32+28) << 24 | (uint64_t)SRC1(ip, i*32+29) << 50;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*13+12) = (uint32_t)SRC(ip, i*32+29) >> 14;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+30) << 12;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+31) << 38;\ +} + +#define BITPACK64_26(ip, op, parm) { \ + BITBLK64_26(ip, 0, op, parm); SRCI(ip); op += 26*4/sizeof(op[0]);\ +} + +#define BITBLK64_27(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*27+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*27+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 27 | (uint64_t)SRC1(ip, i*64+2) << 54;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*27+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 10;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*27+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 17 | (uint64_t)SRC1(ip, i*64+4) << 44;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*27+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 20;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 7;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 34 | (uint64_t)SRC1(ip, i*64+7) << 61;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*27+ 3) = (uint32_t)SRC(ip, i*64+ 7) >> 3;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*27+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 24 | (uint64_t)SRC1(ip, i*64+9) << 51;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*27+ 4) = (uint32_t)SRC(ip, i*64+ 9) >> 13;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*27+ 4) |= (uint64_t)SRC(ip, i*64+10) << 14 | (uint64_t)SRC1(ip, i*64+11) << 41;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*27+ 5) = (uint32_t)SRC(ip, i*64+11) >> 23;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*27+ 5) |= (uint32_t)SRC(ip, i*64+12) << 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*27+ 5) |= (uint64_t)SRC(ip, i*64+13) << 31 | (uint64_t)SRC1(ip, i*64+14) << 58;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*27+ 6) = (uint32_t)SRC(ip, i*64+14) >> 6;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*27+ 6) |= (uint64_t)SRC(ip, i*64+15) << 21 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*27+ 7) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*27+ 7) |= (uint64_t)SRC(ip, i*64+17) << 11 | (uint64_t)SRC1(ip, i*64+18) << 38;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*27+ 8) = (uint32_t)SRC(ip, i*64+18) >> 26;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*27+ 8) |= (uint32_t)SRC(ip, i*64+19) << 1;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*27+ 8) |= (uint64_t)SRC(ip, 
i*64+20) << 28 | (uint64_t)SRC1(ip, i*64+21) << 55;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*27+ 9) = (uint32_t)SRC(ip, i*64+21) >> 9;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*27+ 9) |= (uint64_t)SRC(ip, i*64+22) << 18 | (uint64_t)SRC1(ip, i*64+23) << 45;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*27+10) = (uint32_t)SRC(ip, i*64+23) >> 19;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+25) << 35 | (uint64_t)SRC1(ip, i*64+26) << 62;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*27+11) = (uint32_t)SRC(ip, i*64+26) >> 2;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*27+11) |= (uint64_t)SRC(ip, i*64+27) << 25 | (uint64_t)SRC1(ip, i*64+28) << 52;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*27+12) = (uint32_t)SRC(ip, i*64+28) >> 12;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*27+12) |= (uint64_t)SRC(ip, i*64+29) << 15 | (uint64_t)SRC1(ip, i*64+30) << 42;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*27+13) = (uint32_t)SRC(ip, i*64+30) >> 22;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*27+13) |= (uint32_t)SRC(ip, i*64+31) << 5;\ +} + +#define BITPACK64_27(ip, op, parm) { \ + BITBLK64_27(ip, 0, op, parm); SRCI(ip); op += 27*4/sizeof(op[0]);\ +} + +#define BITBLK64_28(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 28 | (uint64_t)SRC1(ip, i*16+2) << 56;\ + IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*7+ 1) = (uint32_t)SRC(ip, i*16+ 2) >> 8;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*16+ 3) << 20 | (uint64_t)SRC1(ip, i*16+4) << 48;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*7+ 2) = (uint32_t)SRC(ip, i*16+ 4) >> 16;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*16+ 5) << 12 | (uint64_t)SRC1(ip, i*16+6) << 40;\ + IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*7+ 3) = (uint32_t)SRC(ip, i*16+ 6) >> 24;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*16+ 7) << 4;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | (uint64_t)SRC1(ip, i*16+9) << 60;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*7+ 4) = (uint32_t)SRC(ip, i*16+ 9) >> 4;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*16+10) << 24 | (uint64_t)SRC1(ip, i*16+11) << 52;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*7+ 5) = (uint32_t)SRC(ip, i*16+11) >> 12;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*16+12) << 16 | (uint64_t)SRC1(ip, i*16+13) << 44;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*7+ 6) = (uint32_t)SRC(ip, i*16+13) >> 20;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+14) << 8;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+15) << 36;\ +} + +#define BITPACK64_28(ip, op, parm) { \ + BITBLK64_28(ip, 0, op, parm);\ + BITBLK64_28(ip, 1, op, parm); SRCI(ip); op += 28*4/sizeof(op[0]);\ +} + +#define BITBLK64_29(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*29+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*29+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 29 | (uint64_t)SRC1(ip, i*64+2) << 58;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*29+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 6;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*29+ 1) |= 
(uint64_t)SRC(ip, i*64+ 3) << 23 | (uint64_t)SRC1(ip, i*64+4) << 52;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*29+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*29+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 17 | (uint64_t)SRC1(ip, i*64+6) << 46;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*29+ 3) = (uint32_t)SRC(ip, i*64+ 6) >> 18;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*29+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 11 | (uint64_t)SRC1(ip, i*64+8) << 40;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*29+ 4) = (uint32_t)SRC(ip, i*64+ 8) >> 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 5;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+10) << 34 | (uint64_t)SRC1(ip, i*64+11) << 63;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*29+ 5) = (uint32_t)SRC(ip, i*64+11) >> 1;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*29+ 5) |= (uint64_t)SRC(ip, i*64+12) << 28 | (uint64_t)SRC1(ip, i*64+13) << 57;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*29+ 6) = (uint32_t)SRC(ip, i*64+13) >> 7;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*29+ 6) |= (uint64_t)SRC(ip, i*64+14) << 22 | (uint64_t)SRC1(ip, i*64+15) << 51;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*29+ 7) = (uint32_t)SRC(ip, i*64+15) >> 13;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*29+ 7) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 45;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*29+ 8) = (uint32_t)SRC(ip, i*64+17) >> 19;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*29+ 8) |= (uint64_t)SRC(ip, i*64+18) << 10 | (uint64_t)SRC1(ip, i*64+19) << 39;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*29+ 9) = (uint32_t)SRC(ip, i*64+19) >> 25;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+20) << 4;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+21) << 33 | (uint64_t)SRC1(ip, i*64+22) << 62;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*29+10) = (uint32_t)SRC(ip, i*64+22) >> 2;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*29+10) |= (uint64_t)SRC(ip, i*64+23) << 27 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*29+11) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*29+11) |= (uint64_t)SRC(ip, i*64+25) << 21 | (uint64_t)SRC1(ip, i*64+26) << 50;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*29+12) = (uint32_t)SRC(ip, i*64+26) >> 14;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*29+12) |= (uint64_t)SRC(ip, i*64+27) << 15 | (uint64_t)SRC1(ip, i*64+28) << 44;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*29+13) = (uint32_t)SRC(ip, i*64+28) >> 20;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*29+13) |= (uint64_t)SRC(ip, i*64+29) << 9 | (uint64_t)SRC1(ip, i*64+30) << 38;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*29+14) = (uint32_t)SRC(ip, i*64+30) >> 26;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*29+14) |= (uint32_t)SRC(ip, i*64+31) << 3;\ +} + +#define BITPACK64_29(ip, op, parm) { \ + BITBLK64_29(ip, 0, op, parm); SRCI(ip); op += 29*4/sizeof(op[0]);\ +} + +#define BITBLK64_30(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 30 | (uint64_t)SRC1(ip, i*32+2) << 60;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*15+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 4;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*15+ 1) |= 
(uint64_t)SRC(ip, i*32+ 3) << 26 | (uint64_t)SRC1(ip, i*32+4) << 56;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*15+ 2) = (uint32_t)SRC(ip, i*32+ 4) >> 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*32+ 5) << 22 | (uint64_t)SRC1(ip, i*32+6) << 52;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*15+ 3) = (uint32_t)SRC(ip, i*32+ 6) >> 12;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*32+ 7) << 18 | (uint64_t)SRC1(ip, i*32+8) << 48;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*15+ 4) = (uint32_t)SRC(ip, i*32+ 8) >> 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*32+ 9) << 14 | (uint64_t)SRC1(ip, i*32+10) << 44;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*15+ 5) = (uint32_t)SRC(ip, i*32+10) >> 20;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*32+11) << 10 | (uint64_t)SRC1(ip, i*32+12) << 40;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*15+ 6) = (uint32_t)SRC(ip, i*32+12) >> 24;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*32+13) << 6 | (uint64_t)SRC1(ip, i*32+14) << 36;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*15+ 7) = (uint32_t)SRC(ip, i*32+14) >> 28;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*32+15) << 2;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 62;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*15+ 8) = (uint32_t)SRC(ip, i*32+17) >> 2;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*15+ 8) |= (uint64_t)SRC(ip, i*32+18) << 28 | (uint64_t)SRC1(ip, i*32+19) << 58;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*15+ 9) = (uint32_t)SRC(ip, i*32+19) >> 6;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*15+ 9) |= (uint64_t)SRC(ip, i*32+20) << 24 | (uint64_t)SRC1(ip, i*32+21) << 54;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*15+10) = (uint32_t)SRC(ip, i*32+21) >> 10;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*15+10) |= (uint64_t)SRC(ip, i*32+22) << 20 | (uint64_t)SRC1(ip, i*32+23) << 50;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*15+11) = (uint32_t)SRC(ip, i*32+23) >> 14;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*15+11) |= (uint64_t)SRC(ip, i*32+24) << 16 | (uint64_t)SRC1(ip, i*32+25) << 46;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*15+12) = (uint32_t)SRC(ip, i*32+25) >> 18;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*15+12) |= (uint64_t)SRC(ip, i*32+26) << 12 | (uint64_t)SRC1(ip, i*32+27) << 42;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*15+13) = (uint32_t)SRC(ip, i*32+27) >> 22;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*15+13) |= (uint64_t)SRC(ip, i*32+28) << 8 | (uint64_t)SRC1(ip, i*32+29) << 38;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*15+14) = (uint32_t)SRC(ip, i*32+29) >> 26;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+30) << 4;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+31) << 34;\ +} + +#define BITPACK64_30(ip, op, parm) { \ + BITBLK64_30(ip, 0, op, parm); SRCI(ip); op += 30*4/sizeof(op[0]);\ +} + +#define BITBLK64_31(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*31+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*31+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 31 | (uint64_t)SRC1(ip, i*64+2) << 62;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*31+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 2;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*31+ 1) |= 
(uint64_t)SRC(ip, i*64+ 3) << 29 | (uint64_t)SRC1(ip, i*64+4) << 60;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*31+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*31+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 27 | (uint64_t)SRC1(ip, i*64+6) << 58;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*31+ 3) = (uint32_t)SRC(ip, i*64+ 6) >> 6;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*31+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 25 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*31+ 4) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*31+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 23 | (uint64_t)SRC1(ip, i*64+10) << 54;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*31+ 5) = (uint32_t)SRC(ip, i*64+10) >> 10;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*31+ 5) |= (uint64_t)SRC(ip, i*64+11) << 21 | (uint64_t)SRC1(ip, i*64+12) << 52;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*31+ 6) = (uint32_t)SRC(ip, i*64+12) >> 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*31+ 6) |= (uint64_t)SRC(ip, i*64+13) << 19 | (uint64_t)SRC1(ip, i*64+14) << 50;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*31+ 7) = (uint32_t)SRC(ip, i*64+14) >> 14;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*31+ 7) |= (uint64_t)SRC(ip, i*64+15) << 17 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*31+ 8) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*31+ 8) |= (uint64_t)SRC(ip, i*64+17) << 15 | (uint64_t)SRC1(ip, i*64+18) << 46;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*31+ 9) = (uint32_t)SRC(ip, i*64+18) >> 18;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*31+ 9) |= (uint64_t)SRC(ip, i*64+19) << 13 | (uint64_t)SRC1(ip, i*64+20) << 44;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*31+10) = (uint32_t)SRC(ip, i*64+20) >> 20;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*31+10) |= (uint64_t)SRC(ip, i*64+21) << 11 | (uint64_t)SRC1(ip, i*64+22) << 42;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*31+11) = (uint32_t)SRC(ip, i*64+22) >> 22;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*31+11) |= (uint64_t)SRC(ip, i*64+23) << 9 | (uint64_t)SRC1(ip, i*64+24) << 40;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*31+12) = (uint32_t)SRC(ip, i*64+24) >> 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*31+12) |= (uint64_t)SRC(ip, i*64+25) << 7 | (uint64_t)SRC1(ip, i*64+26) << 38;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*31+13) = (uint32_t)SRC(ip, i*64+26) >> 26;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*31+13) |= (uint64_t)SRC(ip, i*64+27) << 5 | (uint64_t)SRC1(ip, i*64+28) << 36;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*31+14) = (uint32_t)SRC(ip, i*64+28) >> 28;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*31+14) |= (uint64_t)SRC(ip, i*64+29) << 3 | (uint64_t)SRC1(ip, i*64+30) << 34;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*31+15) = (uint32_t)SRC(ip, i*64+30) >> 30;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*31+15) |= (uint32_t)SRC(ip, i*64+31) << 1;\ +} + +#define BITPACK64_31(ip, op, parm) { \ + BITBLK64_31(ip, 0, op, parm); SRCI(ip); op += 31*4/sizeof(op[0]);\ +} + +#define BITBLK64_32(ip, i, op, parm) { \ + IPPB(ip, i*2+ 0, parm); *(uint32_t *)(op+i*8+ 0) = SRC(ip, i*2+ 0);\ + IPPB(ip, i*2+ 1, parm); *(uint32_t *)(op+i*8+ 4) = SRC(ip, i*2+ 1);;\ +} + +#define BITPACK64_32(ip, op, parm) { \ + BITBLK64_32(ip, 0, op, parm);\ + BITBLK64_32(ip, 1, op, parm);\ + BITBLK64_32(ip, 2, op, parm);\ + BITBLK64_32(ip, 3, op, parm);\ + 
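/* bit width 32: no packing needed, values are copied verbatim, two 32-bit stores per BITBLK64_32 call */\
+ 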
BITBLK64_32(ip, 4, op, parm);\
+ BITBLK64_32(ip, 5, op, parm);\
+ BITBLK64_32(ip, 6, op, parm);\
+ BITBLK64_32(ip, 7, op, parm);\
+ BITBLK64_32(ip, 8, op, parm);\
+ BITBLK64_32(ip, 9, op, parm);\
+ BITBLK64_32(ip, 10, op, parm);\
+ BITBLK64_32(ip, 11, op, parm);\
+ BITBLK64_32(ip, 12, op, parm);\
+ BITBLK64_32(ip, 13, op, parm);\
+ BITBLK64_32(ip, 14, op, parm);\
+ BITBLK64_32(ip, 15, op, parm); SRCI(ip); op += 32*4/sizeof(op[0]);\
+}
+
diff --git a/bitpack_.h b/bitpack_.h
new file mode 100644
index 0000000..3fcabd2
--- /dev/null
+++ b/bitpack_.h
@@ -0,0 +1,200 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitpack_.h - "Integer Compression" binary packing
+**/
+
+#include <stdint.h>
+#define USE_BITPACK 64
+#if 0
+#define SRCI(__ip) __ip+=32
+#define SRC(__ip,__x) __ip[__x]
+#define SRC1(__ip,__x) __ip[__x]
+//#define SRCP( __ip)
+#else
+#define SRCI(__ip)
+#define SRC1(__ip,__x) (*(__ip/*+1*/))
+#define SRC( __ip,__x) (*__ip++)
+//#define SRCP( __ip) (__ip++)
+#endif
+
+ #if USE_BITPACK == 64
+#include "bitpack64_.h"
+#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\
+ switch(__nbits) {\
+ case 0:__ip = __ipe; break;\
+ case 1:do BITPACK64_1( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 2:do BITPACK64_2( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 3:do BITPACK64_3( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 4:do BITPACK64_4( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 5:do BITPACK64_5( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 6:do BITPACK64_6( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 7:do BITPACK64_7( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 8:do BITPACK64_8( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 9:do BITPACK64_9( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 10:do BITPACK64_10(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 11:do BITPACK64_11(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 12:do BITPACK64_12(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 13:do BITPACK64_13(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 14:do BITPACK64_14(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 15:do BITPACK64_15(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 16:do BITPACK64_16(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 17:do BITPACK64_17(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 18:do BITPACK64_18(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 19:do BITPACK64_19(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 20:do BITPACK64_20(__ip, __op, __parm) 
while(__ip < __ipe); break;\ + case 21:do BITPACK64_21(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 22:do BITPACK64_22(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 23:do BITPACK64_23(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 24:do BITPACK64_24(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 25:do BITPACK64_25(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 26:do BITPACK64_26(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 27:do BITPACK64_27(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 28:do BITPACK64_28(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 29:do BITPACK64_29(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 30:do BITPACK64_30(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 31:do BITPACK64_31(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 32:do BITPACK64_32(__ip, __op, __parm) while(__ip < __ipe);\ + }\ +} while(0) + #elif USE_BITPACK == 32 +#include "bitpack32_.h" +#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\ + switch(__nbits) {\ + case 0:__ip = __ipe; break;\ + case 1:do BITPACK32_1( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 2:do BITPACK32_2( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 3:do BITPACK32_3( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 4:do BITPACK32_4( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 5:do BITPACK32_5( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 6:do BITPACK32_6( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 7:do BITPACK32_7( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 8:do BITPACK32_8( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 9:do BITPACK32_9( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 10:do BITPACK32_10(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 11:do BITPACK32_11(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 12:do BITPACK32_12(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 13:do BITPACK32_13(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 14:do BITPACK32_14(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 15:do BITPACK32_15(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 16:do BITPACK32_16(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 17:do BITPACK32_17(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 18:do BITPACK32_18(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 19:do BITPACK32_19(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 20:do BITPACK32_20(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 21:do BITPACK32_21(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 22:do BITPACK32_22(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 23:do BITPACK32_23(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 24:do BITPACK32_24(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 25:do BITPACK32_25(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 26:do BITPACK32_26(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 27:do BITPACK32_27(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 28:do BITPACK32_28(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 29:do BITPACK32_29(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 30:do BITPACK32_30(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 31:do BITPACK32_31(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 32:do BITPACK32_32(__ip, __op, __parm) while(__ip < __ipe);\ + } 
/*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\ +} while(0) + #else + #if 1 +#define SRCI(__ip) __ip+=32 +#define SRC(__ip,__x) __ip[__x] +#define SRCP( __ip) + #else +#define SRCI(__ip) +#define SRC( __ip,__x) (*__ip++) +//#define SRCP( __ip) (__ip++) + #endif +#include "pack/bitpack32_1.h" +#include "pack/bitpack32_2.h" +#include "pack/bitpack32_3.h" +#include "pack/bitpack32_4.h" +#include "pack/bitpack32_5.h" +#include "pack/bitpack32_6.h" +#include "pack/bitpack32_7.h" +#include "pack/bitpack32_8.h" +#include "pack/bitpack32_9.h" +#include "pack/bitpack32_10.h" +#include "pack/bitpack32_11.h" +#include "pack/bitpack32_12.h" +#include "pack/bitpack32_13.h" +#include "pack/bitpack32_14.h" +#include "pack/bitpack32_15.h" +#include "pack/bitpack32_16.h" +#include "pack/bitpack32_17.h" +#include "pack/bitpack32_18.h" +#include "pack/bitpack32_19.h" +#include "pack/bitpack32_20.h" +#include "pack/bitpack32_21.h" +#include "pack/bitpack32_22.h" +#include "pack/bitpack32_23.h" +#include "pack/bitpack32_24.h" +#include "pack/bitpack32_25.h" +#include "pack/bitpack32_26.h" +#include "pack/bitpack32_27.h" +#include "pack/bitpack32_28.h" +#include "pack/bitpack32_29.h" +#include "pack/bitpack32_30.h" +#include "pack/bitpack32_31.h" +#include "pack/bitpack32_32.h" +#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\ + switch(__nbits) {\ + case 0:__ip = __ipe; break;\ + case 1:do BITPACK_1( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 2:do BITPACK_2( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 3:do BITPACK_3( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 4:do BITPACK_4( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 5:do BITPACK_5( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 6:do BITPACK_6( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 7:do BITPACK_7( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 8:do BITPACK_8( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 9:do BITPACK_9( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 10:do BITPACK_10(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 11:do BITPACK_11(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 12:do BITPACK_12(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 13:do BITPACK_13(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 14:do BITPACK_14(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 15:do BITPACK_15(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 16:do BITPACK_16(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 17:do BITPACK_17(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 18:do BITPACK_18(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 19:do BITPACK_19(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 20:do BITPACK_20(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 21:do BITPACK_21(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 22:do BITPACK_22(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 23:do BITPACK_23(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 24:do BITPACK_24(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 25:do BITPACK_25(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 26:do BITPACK_26(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 27:do BITPACK_27(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 28:do BITPACK_28(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 29:do BITPACK_29(__ip, __op, __parm) while(__ip < 
__ipe); break;\
+ case 30:do BITPACK_30(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 31:do BITPACK_31(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 32:do BITPACK_32(__ip, __op, __parm) while(__ip < __ipe);\
+ } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\
+} while(0)
+ #endif
+//
+
diff --git a/bitunpack.c b/bitunpack.c
new file mode 100644
index 0000000..830ad4b
--- /dev/null
+++ b/bitunpack.c
@@ -0,0 +1,56 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack.c - "Integer Compression" binary unpacking
+**/
+
+#include "bitunpack.h"
+
+#define PAD8(__x) (((__x)+7)/8)
+unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i; for(i=0; i < n; i++ ) out[i] = bitgetx32(in, b, i); return in + PAD8(n*b); }
+unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
+
+#define BPI(__w,__parm) __w
+#include "bitunpack_.h"
+unsigned char *bitunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
+unsigned char *bitunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned short *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
+#undef BPI
+
+//------------------------------------------------------------------------------------------
+#define BPI(__w,__parm) (__parm += (__w) + 1)
+#include "bitunpack_.h"
+
+unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitdunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitdunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+#undef BPI
+
+//------------------------------------------------------------------------------------------
+#define BPI(__w,__parm) (__parm + (__w) + 1)
+#include "bitunpack_.h"
+
+unsigned char *bitfunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); 
BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitfunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+1; return in + PAD8(n*b); }
+unsigned char *bitfunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+#undef BPI
+
diff --git a/bitunpack.h b/bitunpack.h
new file mode 100644
index 0000000..ff1054c
--- /dev/null
+++ b/bitunpack.h
@@ -0,0 +1,51 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack.h - "Integer Compression" binary unpacking
+**/
+
+#include <limits.h>
+
+// BP: direct access to the idx-th b-bit value of a bitpacked buffer
+static inline unsigned bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
+static inline unsigned _bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
+static inline unsigned bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
+static inline unsigned _bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
+// linear search: return the first value >= val and store its position in *oidx
+static inline int bitfindx32(unsigned *__restrict__ in, unsigned n, unsigned b, unsigned val, unsigned *__restrict__ oidx) { unsigned idx; for(idx = 0; idx < n; idx++) { unsigned oval = bitgetx32(in, b, idx); if(oval >= val) { *oidx=idx; return oval; } } return INT_MAX; }
+
diff --git a/bitunpack64_.h b/bitunpack64_.h
new file mode 100644
index 0000000..88ac332
--- /dev/null
+++ b/bitunpack64_.h
@@ -0,0 +1,1365 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack64_.h - "Integer Compression" binary unpacking
+**/
+
+#define BITUNBLK32_0(ip, i, op, parm) { \
+ DST(op,i*0+ 0, 0, parm);\
+ DST(op,i*0+ 1, 0, parm);\
+ DST(op,i*0+ 2, 0, parm);\
+ DST(op,i*0+ 3, 0, parm);\
+ DST(op,i*0+ 4, 0, parm);\
+ DST(op,i*0+ 5, 0, parm);\
+ DST(op,i*0+ 6, 0, parm);\
+ DST(op,i*0+ 7, 0, parm);\
+ DST(op,i*0+ 8, 0, parm);\
+ DST(op,i*0+ 9, 0, parm);\
+ DST(op,i*0+10, 0, parm);\
+ DST(op,i*0+11, 0, parm);\
+ DST(op,i*0+12, 0, parm);\
+ DST(op,i*0+13, 0, parm);\
+ DST(op,i*0+14, 0, parm);\
+ DST(op,i*0+15, 0, parm);\
+ DST(op,i*0+16, 0, parm);\
+ DST(op,i*0+17, 0, parm);\
+ DST(op,i*0+18, 0, parm);\
+ DST(op,i*0+19, 0, parm);\
+ DST(op,i*0+20, 0, parm);\
+ DST(op,i*0+21, 0, parm);\
+ DST(op,i*0+22, 0, parm);\
+ DST(op,i*0+23, 0, parm);\
+ DST(op,i*0+24, 0, parm);\
+ DST(op,i*0+25, 0, parm);\
+ DST(op,i*0+26, 0, parm);\
+ DST(op,i*0+27, 0, parm);\
+ DST(op,i*0+28, 0, parm);\
+ DST(op,i*0+29, 0, parm);\
+ DST(op,i*0+30, 0, parm);\
+ DST(op,i*0+31, 0, parm);;\
+}
+
+#define BITUNPACK64_0(ip, op, parm) { \
+ BITUNBLK32_0(ip, 0, op, parm); DSTI(op);\
+}
+
+#define BITUNBLK32_1(ip, i, op, parm) { register uint32_t w0 = *(uint32_t *)(ip+(i*1+0)*4/sizeof(ip[0]));\
+ DST(op,i*32+ 0, (w0 ) & 0x1, parm);\
+ DST(op,i*32+ 1, (w0 >> 1) & 0x1, parm);\
+ DST(op,i*32+ 2, (w0 >> 2) & 0x1, parm);\
+ DST(op,i*32+ 3, (w0 >> 3) & 0x1, parm);\
+ DST(op,i*32+ 4, (w0 >> 4) & 0x1, parm);\
+ DST(op,i*32+ 5, (w0 >> 5) & 0x1, parm);\
+ DST(op,i*32+ 6, (w0 >> 6) & 0x1, parm);\
+ DST(op,i*32+ 7, (w0 >> 7) & 0x1, parm);\
+ DST(op,i*32+ 8, (w0 >> 8) & 0x1, parm);\
+ DST(op,i*32+ 9, (w0 >> 9) & 0x1, parm);\
+ DST(op,i*32+10, (w0 >> 10) & 0x1, parm);\
+ DST(op,i*32+11, (w0 >> 11) & 0x1, parm);\
+ DST(op,i*32+12, (w0 >> 12) & 0x1, parm);\
+ DST(op,i*32+13, (w0 >> 13) & 0x1, parm);\
+ DST(op,i*32+14, (w0 >> 14) & 0x1, parm);\
+ DST(op,i*32+15, (w0 >> 15) & 0x1, parm);\
+ DST(op,i*32+16, (w0 >> 16) & 0x1, parm);\
+ DST(op,i*32+17, (w0 >> 17) & 0x1, parm);\
+ DST(op,i*32+18, (w0 >> 18) & 0x1, parm);\
+ DST(op,i*32+19, (w0 >> 19) & 0x1, parm);\
+ DST(op,i*32+20, (w0 >> 20) & 0x1, parm);\
+ DST(op,i*32+21, (w0 >> 21) & 0x1, parm);\
+ DST(op,i*32+22, (w0 >> 22) & 0x1, parm);\
+ DST(op,i*32+23, (w0 >> 23) & 0x1, parm);\
+ DST(op,i*32+24, (w0 >> 24) & 0x1, parm);\
+ DST(op,i*32+25, (w0 >> 25) & 0x1, parm);\
+ DST(op,i*32+26, (w0 >> 26) & 0x1, parm);\
+ DST(op,i*32+27, (w0 >> 27) & 0x1, parm);\
+ DST(op,i*32+28, (w0 >> 28) & 0x1, parm);\
+ DST(op,i*32+29, (w0 >> 29) & 0x1, parm);\
+ DST(op,i*32+30, (w0 >> 30) & 0x1, parm);\
+ DST(op,i*32+31, (w0 >> 31) , parm);;\
+}
+
+#define BITUNPACK64_1(ip, op, parm) { \
+ BITUNBLK32_1(ip, 0, op, parm); DSTI(op); ip += 1*4/sizeof(ip[0]);\
+}
+
+#define BITUNBLK64_2(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\
+ DST(op,i*32+ 0, (w0 ) & 0x3, parm);\
+ DST(op,i*32+ 1, (w0 >> 2) & 0x3, parm);\
+ DST(op,i*32+ 2, (w0 >> 4) & 0x3, parm);\
+ DST(op,i*32+ 3, (w0 >> 6) & 0x3, parm);\
+ DST(op,i*32+ 4, (w0 >> 8) & 0x3, parm);\
+ DST(op,i*32+ 5, (w0 >> 10) & 0x3, parm);\
+ DST(op,i*32+ 6, (w0 >> 12) & 0x3, parm);\
+ DST(op,i*32+ 7, (w0 >> 14) & 0x3, parm);\
+ DST(op,i*32+ 8, (w0 >> 16) & 0x3, parm);\
+ DST(op,i*32+ 9, (w0 >> 18) & 0x3, parm);\
+ DST(op,i*32+10, (w0 >> 20) & 0x3, parm);\
+ DST(op,i*32+11, (w0 >> 22) & 0x3, parm);\
+ DST(op,i*32+12, (w0 >> 24) & 0x3, 
parm);\ + DST(op,i*32+13, (w0 >> 26) & 0x3, parm);\ + DST(op,i*32+14, (w0 >> 28) & 0x3, parm);\ + DST(op,i*32+15, (w0 >> 30) & 0x3, parm);\ + DST(op,i*32+16, (w0 >> 32) & 0x3, parm);\ + DST(op,i*32+17, (w0 >> 34) & 0x3, parm);\ + DST(op,i*32+18, (w0 >> 36) & 0x3, parm);\ + DST(op,i*32+19, (w0 >> 38) & 0x3, parm);\ + DST(op,i*32+20, (w0 >> 40) & 0x3, parm);\ + DST(op,i*32+21, (w0 >> 42) & 0x3, parm);\ + DST(op,i*32+22, (w0 >> 44) & 0x3, parm);\ + DST(op,i*32+23, (w0 >> 46) & 0x3, parm);\ + DST(op,i*32+24, (w0 >> 48) & 0x3, parm);\ + DST(op,i*32+25, (w0 >> 50) & 0x3, parm);\ + DST(op,i*32+26, (w0 >> 52) & 0x3, parm);\ + DST(op,i*32+27, (w0 >> 54) & 0x3, parm);\ + DST(op,i*32+28, (w0 >> 56) & 0x3, parm);\ + DST(op,i*32+29, (w0 >> 58) & 0x3, parm);\ + DST(op,i*32+30, (w0 >> 60) & 0x3, parm);\ + DST(op,i*32+31, (w0 >> 62) , parm);;\ +} + +#define BITUNPACK64_2(ip, op, parm) { \ + BITUNBLK64_2(ip, 0, op, parm); DSTI(op); ip += 2*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7, parm);\ + DST(op,i*64+ 1, (w0 >> 3) & 0x7, parm);\ + DST(op,i*64+ 2, (w0 >> 6) & 0x7, parm);\ + DST(op,i*64+ 3, (w0 >> 9) & 0x7, parm);\ + DST(op,i*64+ 4, (w0 >> 12) & 0x7, parm);\ + DST(op,i*64+ 5, (w0 >> 15) & 0x7, parm);\ + DST(op,i*64+ 6, (w0 >> 18) & 0x7, parm);\ + DST(op,i*64+ 7, (w0 >> 21) & 0x7, parm);\ + DST(op,i*64+ 8, (w0 >> 24) & 0x7, parm);\ + DST(op,i*64+ 9, (w0 >> 27) & 0x7, parm);\ + DST(op,i*64+10, (w0 >> 30) & 0x7, parm);\ + DST(op,i*64+11, (w0 >> 33) & 0x7, parm);\ + DST(op,i*64+12, (w0 >> 36) & 0x7, parm);\ + DST(op,i*64+13, (w0 >> 39) & 0x7, parm);\ + DST(op,i*64+14, (w0 >> 42) & 0x7, parm);\ + DST(op,i*64+15, (w0 >> 45) & 0x7, parm);\ + DST(op,i*64+16, (w0 >> 48) & 0x7, parm);\ + DST(op,i*64+17, (w0 >> 51) & 0x7, parm);\ + DST(op,i*64+18, (w0 >> 54) & 0x7, parm);\ + DST(op,i*64+19, (w0 >> 57) & 0x7, parm);\ + DST(op,i*64+20, (w0 >> 60) & 0x7, parm); \ +\ + DST(op,i*64+21, (w0 >> 63) | (w1 << 1) & 0x7, parm);\ + DST(op,i*64+22, (w1 >> 2) & 0x7, parm);\ + DST(op,i*64+23, (w1 >> 5) & 0x7, parm);\ + DST(op,i*64+24, (w1 >> 8) & 0x7, parm);\ + DST(op,i*64+25, (w1 >> 11) & 0x7, parm);\ + DST(op,i*64+26, (w1 >> 14) & 0x7, parm);\ + DST(op,i*64+27, (w1 >> 17) & 0x7, parm);\ + DST(op,i*64+28, (w1 >> 20) & 0x7, parm);\ + DST(op,i*64+29, (w1 >> 23) & 0x7, parm);\ + DST(op,i*64+30, (w1 >> 26) & 0x7, parm);\ + DST(op,i*64+31, (w1 >> 29) & 0x7, parm);;\ +} + +#define BITUNPACK64_3(ip, op, parm) { \ + BITUNBLK64_3(ip, 0, op, parm); DSTI(op); ip += 3*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip/*+(i*1+0)*8/sizeof(ip[0])*/);ip += 8/sizeof(ip[0]);\ + DST(op,i*16+ 0, (w0 ) & 0xf, parm);\ + DST(op,i*16+ 1, (w0 >> 4) & 0xf, parm);\ + DST(op,i*16+ 2, (w0 >> 8) & 0xf, parm);\ + DST(op,i*16+ 3, (w0 >> 12) & 0xf, parm);\ + DST(op,i*16+ 4, (w0 >> 16) & 0xf, parm);\ + DST(op,i*16+ 5, (w0 >> 20) & 0xf, parm);\ + DST(op,i*16+ 6, (w0 >> 24) & 0xf, parm);\ + DST(op,i*16+ 7, (w0 >> 28) & 0xf, parm);\ + DST(op,i*16+ 8, (w0 >> 32) & 0xf, parm);\ + DST(op,i*16+ 9, (w0 >> 36) & 0xf, parm);\ + DST(op,i*16+10, (w0 >> 40) & 0xf, parm);\ + DST(op,i*16+11, (w0 >> 44) & 0xf, parm);\ + DST(op,i*16+12, (w0 >> 48) & 0xf, parm);\ + DST(op,i*16+13, (w0 >> 52) & 0xf, parm);\ + DST(op,i*16+14, (w0 >> 56) & 0xf, parm);\ + DST(op,i*16+15, (w0 >> 60), parm);;\ +} + +#define BITUNPACK64_4(ip, op, parm) { \ + 
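/* b=4: each BITUNBLK64_4 consumes one 64-bit word (16 values) and advances ip itself, so no ip increment is needed after the calls */\
+ 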
BITUNBLK64_4(ip, 0, op, parm);\ + BITUNBLK64_4(ip, 1, op, parm); DSTI(op); /*ip += 4*4/sizeof(ip[0]);*/\ +} + +#define BITUNBLK64_5(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1f, parm);\ + DST(op,i*64+ 1, (w0 >> 5) & 0x1f, parm);\ + DST(op,i*64+ 2, (w0 >> 10) & 0x1f, parm);\ + DST(op,i*64+ 3, (w0 >> 15) & 0x1f, parm);\ + DST(op,i*64+ 4, (w0 >> 20) & 0x1f, parm);\ + DST(op,i*64+ 5, (w0 >> 25) & 0x1f, parm);\ + DST(op,i*64+ 6, (w0 >> 30) & 0x1f, parm);\ + DST(op,i*64+ 7, (w0 >> 35) & 0x1f, parm);\ + DST(op,i*64+ 8, (w0 >> 40) & 0x1f, parm);\ + DST(op,i*64+ 9, (w0 >> 45) & 0x1f, parm);\ + DST(op,i*64+10, (w0 >> 50) & 0x1f, parm);\ + DST(op,i*64+11, (w0 >> 55) & 0x1f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0])); \ +\ + DST(op,i*64+12, (w0 >> 60) | (w1 << 4) & 0x1f, parm);\ + DST(op,i*64+13, (w1 >> 1) & 0x1f, parm);\ + DST(op,i*64+14, (w1 >> 6) & 0x1f, parm);\ + DST(op,i*64+15, (w1 >> 11) & 0x1f, parm);\ + DST(op,i*64+16, (w1 >> 16) & 0x1f, parm);\ + DST(op,i*64+17, (w1 >> 21) & 0x1f, parm);\ + DST(op,i*64+18, (w1 >> 26) & 0x1f, parm);\ + DST(op,i*64+19, (w1 >> 31) & 0x1f, parm);\ + DST(op,i*64+20, (w1 >> 36) & 0x1f, parm);\ + DST(op,i*64+21, (w1 >> 41) & 0x1f, parm);\ + DST(op,i*64+22, (w1 >> 46) & 0x1f, parm);\ + DST(op,i*64+23, (w1 >> 51) & 0x1f, parm);\ + DST(op,i*64+24, (w1 >> 56) & 0x1f, parm); register uint32_t w2 = *(uint32_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w1 >> 61) | (w2 << 3) & 0x1f, parm);\ + DST(op,i*64+26, (w2 >> 2) & 0x1f, parm);\ + DST(op,i*64+27, (w2 >> 7) & 0x1f, parm);\ + DST(op,i*64+28, (w2 >> 12) & 0x1f, parm);\ + DST(op,i*64+29, (w2 >> 17) & 0x1f, parm);\ + DST(op,i*64+30, (w2 >> 22) & 0x1f, parm);\ + DST(op,i*64+31, (w2 >> 27) & 0x1f, parm);;\ +} + +#define BITUNPACK64_5(ip, op, parm) { \ + BITUNBLK64_5(ip, 0, op, parm); DSTI(op); ip += 5*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_6(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3f, parm);\ + DST(op,i*32+ 1, (w0 >> 6) & 0x3f, parm);\ + DST(op,i*32+ 2, (w0 >> 12) & 0x3f, parm);\ + DST(op,i*32+ 3, (w0 >> 18) & 0x3f, parm);\ + DST(op,i*32+ 4, (w0 >> 24) & 0x3f, parm);\ + DST(op,i*32+ 5, (w0 >> 30) & 0x3f, parm);\ + DST(op,i*32+ 6, (w0 >> 36) & 0x3f, parm);\ + DST(op,i*32+ 7, (w0 >> 42) & 0x3f, parm);\ + DST(op,i*32+ 8, (w0 >> 48) & 0x3f, parm);\ + DST(op,i*32+ 9, (w0 >> 54) & 0x3f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w0 >> 60) | (w1 << 4) & 0x3f, parm);\ + DST(op,i*32+11, (w1 >> 2) & 0x3f, parm);\ + DST(op,i*32+12, (w1 >> 8) & 0x3f, parm);\ + DST(op,i*32+13, (w1 >> 14) & 0x3f, parm);\ + DST(op,i*32+14, (w1 >> 20) & 0x3f, parm);\ + DST(op,i*32+15, (w1 >> 26) & 0x3f, parm);\ + DST(op,i*32+16, (w1 >> 32) & 0x3f, parm);\ + DST(op,i*32+17, (w1 >> 38) & 0x3f, parm);\ + DST(op,i*32+18, (w1 >> 44) & 0x3f, parm);\ + DST(op,i*32+19, (w1 >> 50) & 0x3f, parm);\ + DST(op,i*32+20, (w1 >> 56) & 0x3f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w1 >> 62) | (w2 << 2) & 0x3f, parm);\ + DST(op,i*32+22, (w2 >> 4) & 0x3f, parm);\ + DST(op,i*32+23, (w2 >> 10) & 0x3f, parm);\ + DST(op,i*32+24, (w2 >> 16) & 0x3f, parm);\ + DST(op,i*32+25, (w2 >> 22) & 0x3f, parm);\ + DST(op,i*32+26, (w2 >> 28) & 0x3f, parm);\ + DST(op,i*32+27, (w2 >> 34) & 0x3f, parm);\ + DST(op,i*32+28, (w2 >> 40) & 0x3f, parm);\ + DST(op,i*32+29, (w2 >> 46) & 0x3f, parm);\ + 
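/* w2 supplies the final 6-bit values: shift 52 masked, then shift 58, which leaves exactly 6 bits and needs no mask */\
+ 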
DST(op,i*32+30, (w2 >> 52) & 0x3f, parm);\ + DST(op,i*32+31, (w2 >> 58) , parm);;\ +} + +#define BITUNPACK64_6(ip, op, parm) { \ + BITUNBLK64_6(ip, 0, op, parm); DSTI(op); ip += 6*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_7(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7f, parm);\ + DST(op,i*64+ 1, (w0 >> 7) & 0x7f, parm);\ + DST(op,i*64+ 2, (w0 >> 14) & 0x7f, parm);\ + DST(op,i*64+ 3, (w0 >> 21) & 0x7f, parm);\ + DST(op,i*64+ 4, (w0 >> 28) & 0x7f, parm);\ + DST(op,i*64+ 5, (w0 >> 35) & 0x7f, parm);\ + DST(op,i*64+ 6, (w0 >> 42) & 0x7f, parm);\ + DST(op,i*64+ 7, (w0 >> 49) & 0x7f, parm);\ + DST(op,i*64+ 8, (w0 >> 56) & 0x7f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w0 >> 63) | (w1 << 1) & 0x7f, parm);\ + DST(op,i*64+10, (w1 >> 6) & 0x7f, parm);\ + DST(op,i*64+11, (w1 >> 13) & 0x7f, parm);\ + DST(op,i*64+12, (w1 >> 20) & 0x7f, parm);\ + DST(op,i*64+13, (w1 >> 27) & 0x7f, parm);\ + DST(op,i*64+14, (w1 >> 34) & 0x7f, parm);\ + DST(op,i*64+15, (w1 >> 41) & 0x7f, parm);\ + DST(op,i*64+16, (w1 >> 48) & 0x7f, parm);\ + DST(op,i*64+17, (w1 >> 55) & 0x7f, parm); register uint32_t w3 = *(uint32_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w1 >> 62) | (w2 << 2) & 0x7f, parm);\ + DST(op,i*64+19, (w2 >> 5) & 0x7f, parm);\ + DST(op,i*64+20, (w2 >> 12) & 0x7f, parm);\ + DST(op,i*64+21, (w2 >> 19) & 0x7f, parm);\ + DST(op,i*64+22, (w2 >> 26) & 0x7f, parm);\ + DST(op,i*64+23, (w2 >> 33) & 0x7f, parm);\ + DST(op,i*64+24, (w2 >> 40) & 0x7f, parm);\ + DST(op,i*64+25, (w2 >> 47) & 0x7f, parm);\ + DST(op,i*64+26, (w2 >> 54) & 0x7f, parm); \ +\ + DST(op,i*64+27, (w2 >> 61) | (w3 << 3) & 0x7f, parm);\ + DST(op,i*64+28, (w3 >> 4) & 0x7f, parm);\ + DST(op,i*64+29, (w3 >> 11) & 0x7f, parm);\ + DST(op,i*64+30, (w3 >> 18) & 0x7f, parm);\ + DST(op,i*64+31, (w3 >> 25) & 0x7f, parm);;\ +} + +#define BITUNPACK64_7(ip, op, parm) { \ + BITUNBLK64_7(ip, 0, op, parm); DSTI(op); ip += 7*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_8(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ + DST(op,i*8+ 0, (w0 ) & 0xff, parm);\ + DST(op,i*8+ 1, (w0 >> 8) & 0xff, parm);\ + DST(op,i*8+ 2, (w0 >> 16) & 0xff, parm);\ + DST(op,i*8+ 3, (w0 >> 24) & 0xff, parm);\ + DST(op,i*8+ 4, (w0 >> 32) & 0xff, parm);\ + DST(op,i*8+ 5, (w0 >> 40) & 0xff, parm);\ + DST(op,i*8+ 6, (w0 >> 48) & 0xff, parm);\ + DST(op,i*8+ 7, (w0 >> 56) , parm);;\ +} + +#define BITUNPACK64_8(ip, op, parm) { \ + BITUNBLK64_8(ip, 0, op, parm);\ + BITUNBLK64_8(ip, 1, op, parm);\ + BITUNBLK64_8(ip, 2, op, parm);\ + BITUNBLK64_8(ip, 3, op, parm); DSTI(op); ip += 8*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_9(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ff, parm);\ + DST(op,i*64+ 1, (w0 >> 9) & 0x1ff, parm);\ + DST(op,i*64+ 2, (w0 >> 18) & 0x1ff, parm);\ + DST(op,i*64+ 3, (w0 >> 27) & 0x1ff, parm);\ + DST(op,i*64+ 4, (w0 >> 36) & 0x1ff, parm);\ + DST(op,i*64+ 5, (w0 >> 45) & 0x1ff, parm);\ + DST(op,i*64+ 6, (w0 >> 54) & 0x1ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w0 >> 63) | (w1 << 1) & 0x1ff, parm);\ + DST(op,i*64+ 8, (w1 >> 8) & 0x1ff, parm);\ + DST(op,i*64+ 9, (w1 >> 17) & 0x1ff, parm);\ + DST(op,i*64+10, (w1 >> 26) & 0x1ff, parm);\ + DST(op,i*64+11, (w1 >> 
35) & 0x1ff, parm);\ + DST(op,i*64+12, (w1 >> 44) & 0x1ff, parm);\ + DST(op,i*64+13, (w1 >> 53) & 0x1ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w1 >> 62) | (w2 << 2) & 0x1ff, parm);\ + DST(op,i*64+15, (w2 >> 7) & 0x1ff, parm);\ + DST(op,i*64+16, (w2 >> 16) & 0x1ff, parm);\ + DST(op,i*64+17, (w2 >> 25) & 0x1ff, parm);\ + DST(op,i*64+18, (w2 >> 34) & 0x1ff, parm);\ + DST(op,i*64+19, (w2 >> 43) & 0x1ff, parm);\ + DST(op,i*64+20, (w2 >> 52) & 0x1ff, parm); register uint32_t w4 = *(uint32_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w2 >> 61) | (w3 << 3) & 0x1ff, parm);\ + DST(op,i*64+22, (w3 >> 6) & 0x1ff, parm);\ + DST(op,i*64+23, (w3 >> 15) & 0x1ff, parm);\ + DST(op,i*64+24, (w3 >> 24) & 0x1ff, parm);\ + DST(op,i*64+25, (w3 >> 33) & 0x1ff, parm);\ + DST(op,i*64+26, (w3 >> 42) & 0x1ff, parm);\ + DST(op,i*64+27, (w3 >> 51) & 0x1ff, parm); \ +\ + DST(op,i*64+28, (w3 >> 60) | (w4 << 4) & 0x1ff, parm);\ + DST(op,i*64+29, (w4 >> 5) & 0x1ff, parm);\ + DST(op,i*64+30, (w4 >> 14) & 0x1ff, parm);\ + DST(op,i*64+31, (w4 >> 23) & 0x1ff, parm);;\ +} + +#define BITUNPACK64_9(ip, op, parm) { \ + BITUNBLK64_9(ip, 0, op, parm); DSTI(op); ip += 9*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_10(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ff, parm);\ + DST(op,i*32+ 1, (w0 >> 10) & 0x3ff, parm);\ + DST(op,i*32+ 2, (w0 >> 20) & 0x3ff, parm);\ + DST(op,i*32+ 3, (w0 >> 30) & 0x3ff, parm);\ + DST(op,i*32+ 4, (w0 >> 40) & 0x3ff, parm);\ + DST(op,i*32+ 5, (w0 >> 50) & 0x3ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 6, (w0 >> 60) | (w1 << 4) & 0x3ff, parm);\ + DST(op,i*32+ 7, (w1 >> 6) & 0x3ff, parm);\ + DST(op,i*32+ 8, (w1 >> 16) & 0x3ff, parm);\ + DST(op,i*32+ 9, (w1 >> 26) & 0x3ff, parm);\ + DST(op,i*32+10, (w1 >> 36) & 0x3ff, parm);\ + DST(op,i*32+11, (w1 >> 46) & 0x3ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w1 >> 56) | (w2 << 8) & 0x3ff, parm);\ + DST(op,i*32+13, (w2 >> 2) & 0x3ff, parm);\ + DST(op,i*32+14, (w2 >> 12) & 0x3ff, parm);\ + DST(op,i*32+15, (w2 >> 22) & 0x3ff, parm);\ + DST(op,i*32+16, (w2 >> 32) & 0x3ff, parm);\ + DST(op,i*32+17, (w2 >> 42) & 0x3ff, parm);\ + DST(op,i*32+18, (w2 >> 52) & 0x3ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w2 >> 62) | (w3 << 2) & 0x3ff, parm);\ + DST(op,i*32+20, (w3 >> 8) & 0x3ff, parm);\ + DST(op,i*32+21, (w3 >> 18) & 0x3ff, parm);\ + DST(op,i*32+22, (w3 >> 28) & 0x3ff, parm);\ + DST(op,i*32+23, (w3 >> 38) & 0x3ff, parm);\ + DST(op,i*32+24, (w3 >> 48) & 0x3ff, parm); \ +\ + DST(op,i*32+25, (w3 >> 58) | (w4 << 6) & 0x3ff, parm);\ + DST(op,i*32+26, (w4 >> 4) & 0x3ff, parm);\ + DST(op,i*32+27, (w4 >> 14) & 0x3ff, parm);\ + DST(op,i*32+28, (w4 >> 24) & 0x3ff, parm);\ + DST(op,i*32+29, (w4 >> 34) & 0x3ff, parm);\ + DST(op,i*32+30, (w4 >> 44) & 0x3ff, parm);\ + DST(op,i*32+31, (w4 >> 54) , parm);;\ +} + +#define BITUNPACK64_10(ip, op, parm) { \ + BITUNBLK64_10(ip, 0, op, parm); DSTI(op); ip += 10*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_11(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ff, parm);\ + DST(op,i*64+ 1, (w0 >> 11) & 0x7ff, parm);\ + DST(op,i*64+ 2, (w0 >> 22) & 0x7ff, 
parm);\ + DST(op,i*64+ 3, (w0 >> 33) & 0x7ff, parm);\ + DST(op,i*64+ 4, (w0 >> 44) & 0x7ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w0 >> 55) | (w1 << 9) & 0x7ff, parm);\ + DST(op,i*64+ 6, (w1 >> 2) & 0x7ff, parm);\ + DST(op,i*64+ 7, (w1 >> 13) & 0x7ff, parm);\ + DST(op,i*64+ 8, (w1 >> 24) & 0x7ff, parm);\ + DST(op,i*64+ 9, (w1 >> 35) & 0x7ff, parm);\ + DST(op,i*64+10, (w1 >> 46) & 0x7ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w1 >> 57) | (w2 << 7) & 0x7ff, parm);\ + DST(op,i*64+12, (w2 >> 4) & 0x7ff, parm);\ + DST(op,i*64+13, (w2 >> 15) & 0x7ff, parm);\ + DST(op,i*64+14, (w2 >> 26) & 0x7ff, parm);\ + DST(op,i*64+15, (w2 >> 37) & 0x7ff, parm);\ + DST(op,i*64+16, (w2 >> 48) & 0x7ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w2 >> 59) | (w3 << 5) & 0x7ff, parm);\ + DST(op,i*64+18, (w3 >> 6) & 0x7ff, parm);\ + DST(op,i*64+19, (w3 >> 17) & 0x7ff, parm);\ + DST(op,i*64+20, (w3 >> 28) & 0x7ff, parm);\ + DST(op,i*64+21, (w3 >> 39) & 0x7ff, parm);\ + DST(op,i*64+22, (w3 >> 50) & 0x7ff, parm); register uint32_t w5 = *(uint32_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w3 >> 61) | (w4 << 3) & 0x7ff, parm);\ + DST(op,i*64+24, (w4 >> 8) & 0x7ff, parm);\ + DST(op,i*64+25, (w4 >> 19) & 0x7ff, parm);\ + DST(op,i*64+26, (w4 >> 30) & 0x7ff, parm);\ + DST(op,i*64+27, (w4 >> 41) & 0x7ff, parm);\ + DST(op,i*64+28, (w4 >> 52) & 0x7ff, parm); \ +\ + DST(op,i*64+29, (w4 >> 63) | (w5 << 1) & 0x7ff, parm);\ + DST(op,i*64+30, (w5 >> 10) & 0x7ff, parm);\ + DST(op,i*64+31, (w5 >> 21) & 0x7ff, parm);;\ +} + +#define BITUNPACK64_11(ip, op, parm) { \ + BITUNBLK64_11(ip, 0, op, parm); DSTI(op); ip += 11*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_12(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 0xfff, parm);\ + DST(op,i*16+ 1, (w0 >> 12) & 0xfff, parm);\ + DST(op,i*16+ 2, (w0 >> 24) & 0xfff, parm);\ + DST(op,i*16+ 3, (w0 >> 36) & 0xfff, parm);\ + DST(op,i*16+ 4, (w0 >> 48) & 0xfff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 5, (w0 >> 60) | (w1 << 4) & 0xfff, parm);\ + DST(op,i*16+ 6, (w1 >> 8) & 0xfff, parm);\ + DST(op,i*16+ 7, (w1 >> 20) & 0xfff, parm);\ + DST(op,i*16+ 8, (w1 >> 32) & 0xfff, parm);\ + DST(op,i*16+ 9, (w1 >> 44) & 0xfff, parm); \ +\ + DST(op,i*16+10, (w1 >> 56) | (w2 << 8) & 0xfff, parm);\ + DST(op,i*16+11, (w2 >> 4) & 0xfff, parm);\ + DST(op,i*16+12, (w2 >> 16) & 0xfff, parm);\ + DST(op,i*16+13, (w2 >> 28) & 0xfff, parm);\ + DST(op,i*16+14, (w2 >> 40) & 0xfff, parm);\ + DST(op,i*16+15, (w2 >> 52) , parm);;\ +} + +#define BITUNPACK64_12(ip, op, parm) { \ + BITUNBLK64_12(ip, 0, op, parm);\ + BITUNBLK64_12(ip, 1, op, parm); DSTI(op); ip += 12*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_13(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fff, parm);\ + DST(op,i*64+ 1, (w0 >> 13) & 0x1fff, parm);\ + DST(op,i*64+ 2, (w0 >> 26) & 0x1fff, parm);\ + DST(op,i*64+ 3, (w0 >> 39) & 0x1fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w0 >> 52) | (w1 << 12) & 0x1fff, parm);\ + DST(op,i*64+ 5, (w1 >> 1) & 0x1fff, parm);\ + DST(op,i*64+ 6, (w1 >> 14) & 0x1fff, parm);\ + DST(op,i*64+ 
7, (w1 >> 27) & 0x1fff, parm);\ + DST(op,i*64+ 8, (w1 >> 40) & 0x1fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w1 >> 53) | (w2 << 11) & 0x1fff, parm);\ + DST(op,i*64+10, (w2 >> 2) & 0x1fff, parm);\ + DST(op,i*64+11, (w2 >> 15) & 0x1fff, parm);\ + DST(op,i*64+12, (w2 >> 28) & 0x1fff, parm);\ + DST(op,i*64+13, (w2 >> 41) & 0x1fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w2 >> 54) | (w3 << 10) & 0x1fff, parm);\ + DST(op,i*64+15, (w3 >> 3) & 0x1fff, parm);\ + DST(op,i*64+16, (w3 >> 16) & 0x1fff, parm);\ + DST(op,i*64+17, (w3 >> 29) & 0x1fff, parm);\ + DST(op,i*64+18, (w3 >> 42) & 0x1fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w3 >> 55) | (w4 << 9) & 0x1fff, parm);\ + DST(op,i*64+20, (w4 >> 4) & 0x1fff, parm);\ + DST(op,i*64+21, (w4 >> 17) & 0x1fff, parm);\ + DST(op,i*64+22, (w4 >> 30) & 0x1fff, parm);\ + DST(op,i*64+23, (w4 >> 43) & 0x1fff, parm); register uint32_t w6 = *(uint32_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w4 >> 56) | (w5 << 8) & 0x1fff, parm);\ + DST(op,i*64+25, (w5 >> 5) & 0x1fff, parm);\ + DST(op,i*64+26, (w5 >> 18) & 0x1fff, parm);\ + DST(op,i*64+27, (w5 >> 31) & 0x1fff, parm);\ + DST(op,i*64+28, (w5 >> 44) & 0x1fff, parm); \ +\ + DST(op,i*64+29, (w5 >> 57) | (w6 << 7) & 0x1fff, parm);\ + DST(op,i*64+30, (w6 >> 6) & 0x1fff, parm);\ + DST(op,i*64+31, (w6 >> 19) & 0x1fff, parm);;\ +} + +#define BITUNPACK64_13(ip, op, parm) { \ + BITUNBLK64_13(ip, 0, op, parm); DSTI(op); ip += 13*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_14(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fff, parm);\ + DST(op,i*32+ 1, (w0 >> 14) & 0x3fff, parm);\ + DST(op,i*32+ 2, (w0 >> 28) & 0x3fff, parm);\ + DST(op,i*32+ 3, (w0 >> 42) & 0x3fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w0 >> 56) | (w1 << 8) & 0x3fff, parm);\ + DST(op,i*32+ 5, (w1 >> 6) & 0x3fff, parm);\ + DST(op,i*32+ 6, (w1 >> 20) & 0x3fff, parm);\ + DST(op,i*32+ 7, (w1 >> 34) & 0x3fff, parm);\ + DST(op,i*32+ 8, (w1 >> 48) & 0x3fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 9, (w1 >> 62) | (w2 << 2) & 0x3fff, parm);\ + DST(op,i*32+10, (w2 >> 12) & 0x3fff, parm);\ + DST(op,i*32+11, (w2 >> 26) & 0x3fff, parm);\ + DST(op,i*32+12, (w2 >> 40) & 0x3fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+13, (w2 >> 54) | (w3 << 10) & 0x3fff, parm);\ + DST(op,i*32+14, (w3 >> 4) & 0x3fff, parm);\ + DST(op,i*32+15, (w3 >> 18) & 0x3fff, parm);\ + DST(op,i*32+16, (w3 >> 32) & 0x3fff, parm);\ + DST(op,i*32+17, (w3 >> 46) & 0x3fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+18, (w3 >> 60) | (w4 << 4) & 0x3fff, parm);\ + DST(op,i*32+19, (w4 >> 10) & 0x3fff, parm);\ + DST(op,i*32+20, (w4 >> 24) & 0x3fff, parm);\ + DST(op,i*32+21, (w4 >> 38) & 0x3fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+22, (w4 >> 52) | (w5 << 12) & 0x3fff, parm);\ + DST(op,i*32+23, (w5 >> 2) & 0x3fff, parm);\ + DST(op,i*32+24, (w5 >> 16) & 0x3fff, parm);\ + DST(op,i*32+25, (w5 >> 30) & 0x3fff, parm);\ + DST(op,i*32+26, (w5 >> 44) & 0x3fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w5 >> 58) | (w6 << 6) & 0x3fff, parm);\ + DST(op,i*32+28, (w6 >> 
8) & 0x3fff, parm);\ + DST(op,i*32+29, (w6 >> 22) & 0x3fff, parm);\ + DST(op,i*32+30, (w6 >> 36) & 0x3fff, parm);\ + DST(op,i*32+31, (w6 >> 50) , parm);;\ +} + +#define BITUNPACK64_14(ip, op, parm) { \ + BITUNBLK64_14(ip, 0, op, parm); DSTI(op); ip += 14*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_15(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fff, parm);\ + DST(op,i*64+ 1, (w0 >> 15) & 0x7fff, parm);\ + DST(op,i*64+ 2, (w0 >> 30) & 0x7fff, parm);\ + DST(op,i*64+ 3, (w0 >> 45) & 0x7fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w0 >> 60) | (w1 << 4) & 0x7fff, parm);\ + DST(op,i*64+ 5, (w1 >> 11) & 0x7fff, parm);\ + DST(op,i*64+ 6, (w1 >> 26) & 0x7fff, parm);\ + DST(op,i*64+ 7, (w1 >> 41) & 0x7fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w1 >> 56) | (w2 << 8) & 0x7fff, parm);\ + DST(op,i*64+ 9, (w2 >> 7) & 0x7fff, parm);\ + DST(op,i*64+10, (w2 >> 22) & 0x7fff, parm);\ + DST(op,i*64+11, (w2 >> 37) & 0x7fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w2 >> 52) | (w3 << 12) & 0x7fff, parm);\ + DST(op,i*64+13, (w3 >> 3) & 0x7fff, parm);\ + DST(op,i*64+14, (w3 >> 18) & 0x7fff, parm);\ + DST(op,i*64+15, (w3 >> 33) & 0x7fff, parm);\ + DST(op,i*64+16, (w3 >> 48) & 0x7fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w3 >> 63) | (w4 << 1) & 0x7fff, parm);\ + DST(op,i*64+18, (w4 >> 14) & 0x7fff, parm);\ + DST(op,i*64+19, (w4 >> 29) & 0x7fff, parm);\ + DST(op,i*64+20, (w4 >> 44) & 0x7fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w4 >> 59) | (w5 << 5) & 0x7fff, parm);\ + DST(op,i*64+22, (w5 >> 10) & 0x7fff, parm);\ + DST(op,i*64+23, (w5 >> 25) & 0x7fff, parm);\ + DST(op,i*64+24, (w5 >> 40) & 0x7fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w5 >> 55) | (w6 << 9) & 0x7fff, parm);\ + DST(op,i*64+26, (w6 >> 6) & 0x7fff, parm);\ + DST(op,i*64+27, (w6 >> 21) & 0x7fff, parm);\ + DST(op,i*64+28, (w6 >> 36) & 0x7fff, parm); register uint32_t w7 = *(uint32_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+29, (w6 >> 51) | (w7 << 13) & 0x7fff, parm);\ + DST(op,i*64+30, (w7 >> 2) & 0x7fff, parm);\ + DST(op,i*64+31, (w7 >> 17) & 0x7fff, parm);;\ +} + +#define BITUNPACK64_15(ip, op, parm) { \ + BITUNBLK64_15(ip, 0, op, parm); DSTI(op); ip += 15*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_16(ip, i, op, parm) { \ + DST(op,i*4+ 0, *(uint16_t *)(ip+i*8+ 0), parm);\ + DST(op,i*4+ 1, *(uint16_t *)(ip+i*8+ 2), parm);\ + DST(op,i*4+ 2, *(uint16_t *)(ip+i*8+ 4), parm);\ + DST(op,i*4+ 3, *(uint16_t *)(ip+i*8+ 6), parm);;\ +} + +#define BITUNPACK64_16(ip, op, parm) { \ + BITUNBLK64_16(ip, 0, op, parm);\ + BITUNBLK64_16(ip, 1, op, parm);\ + BITUNBLK64_16(ip, 2, op, parm);\ + BITUNBLK64_16(ip, 3, op, parm);\ + BITUNBLK64_16(ip, 4, op, parm);\ + BITUNBLK64_16(ip, 5, op, parm);\ + BITUNBLK64_16(ip, 6, op, parm);\ + BITUNBLK64_16(ip, 7, op, parm); DSTI(op); ip += 16*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_17(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*17+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ffff, parm);\ + DST(op,i*64+ 1, (w0 >> 17) & 0x1ffff, parm);\ + DST(op,i*64+ 2, (w0 >> 34) & 0x1ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*17+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 51) | (w1 
<< 13) & 0x1ffff, parm);\ + DST(op,i*64+ 4, (w1 >> 4) & 0x1ffff, parm);\ + DST(op,i*64+ 5, (w1 >> 21) & 0x1ffff, parm);\ + DST(op,i*64+ 6, (w1 >> 38) & 0x1ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*17+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w1 >> 55) | (w2 << 9) & 0x1ffff, parm);\ + DST(op,i*64+ 8, (w2 >> 8) & 0x1ffff, parm);\ + DST(op,i*64+ 9, (w2 >> 25) & 0x1ffff, parm);\ + DST(op,i*64+10, (w2 >> 42) & 0x1ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*17+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w2 >> 59) | (w3 << 5) & 0x1ffff, parm);\ + DST(op,i*64+12, (w3 >> 12) & 0x1ffff, parm);\ + DST(op,i*64+13, (w3 >> 29) & 0x1ffff, parm);\ + DST(op,i*64+14, (w3 >> 46) & 0x1ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*17+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w3 >> 63) | (w4 << 1) & 0x1ffff, parm);\ + DST(op,i*64+16, (w4 >> 16) & 0x1ffff, parm);\ + DST(op,i*64+17, (w4 >> 33) & 0x1ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*17+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w4 >> 50) | (w5 << 14) & 0x1ffff, parm);\ + DST(op,i*64+19, (w5 >> 3) & 0x1ffff, parm);\ + DST(op,i*64+20, (w5 >> 20) & 0x1ffff, parm);\ + DST(op,i*64+21, (w5 >> 37) & 0x1ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*17+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w5 >> 54) | (w6 << 10) & 0x1ffff, parm);\ + DST(op,i*64+23, (w6 >> 7) & 0x1ffff, parm);\ + DST(op,i*64+24, (w6 >> 24) & 0x1ffff, parm);\ + DST(op,i*64+25, (w6 >> 41) & 0x1ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*17+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w6 >> 58) | (w7 << 6) & 0x1ffff, parm);\ + DST(op,i*64+27, (w7 >> 11) & 0x1ffff, parm);\ + DST(op,i*64+28, (w7 >> 28) & 0x1ffff, parm);\ + DST(op,i*64+29, (w7 >> 45) & 0x1ffff, parm); register uint32_t w8 = *(uint32_t *)(ip+(i*17+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w7 >> 62) | (w8 << 2) & 0x1ffff, parm);\ + DST(op,i*64+31, (w8 >> 15) & 0x1ffff, parm);;\ +} + +#define BITUNPACK64_17(ip, op, parm) { \ + BITUNBLK64_17(ip, 0, op, parm); DSTI(op); ip += 17*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_18(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ffff, parm);\ + DST(op,i*32+ 1, (w0 >> 18) & 0x3ffff, parm);\ + DST(op,i*32+ 2, (w0 >> 36) & 0x3ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 3, (w0 >> 54) | (w1 << 10) & 0x3ffff, parm);\ + DST(op,i*32+ 4, (w1 >> 8) & 0x3ffff, parm);\ + DST(op,i*32+ 5, (w1 >> 26) & 0x3ffff, parm);\ + DST(op,i*32+ 6, (w1 >> 44) & 0x3ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 7, (w1 >> 62) | (w2 << 2) & 0x3ffff, parm);\ + DST(op,i*32+ 8, (w2 >> 16) & 0x3ffff, parm);\ + DST(op,i*32+ 9, (w2 >> 34) & 0x3ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w2 >> 52) | (w3 << 12) & 0x3ffff, parm);\ + DST(op,i*32+11, (w3 >> 6) & 0x3ffff, parm);\ + DST(op,i*32+12, (w3 >> 24) & 0x3ffff, parm);\ + DST(op,i*32+13, (w3 >> 42) & 0x3ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w3 >> 60) | (w4 << 4) & 0x3ffff, parm);\ + DST(op,i*32+15, (w4 >> 14) & 0x3ffff, parm);\ + DST(op,i*32+16, (w4 >> 32) & 0x3ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*9+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w4 >> 50) | (w5 << 14) & 0x3ffff, parm);\ + DST(op,i*32+18, (w5 >> 4) & 0x3ffff, parm);\ + DST(op,i*32+19, (w5 >> 22) & 0x3ffff, parm);\ + DST(op,i*32+20, 
(w5 >> 40) & 0x3ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*9+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w5 >> 58) | (w6 << 6) & 0x3ffff, parm);\ + DST(op,i*32+22, (w6 >> 12) & 0x3ffff, parm);\ + DST(op,i*32+23, (w6 >> 30) & 0x3ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*9+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+24, (w6 >> 48) | (w7 << 16) & 0x3ffff, parm);\ + DST(op,i*32+25, (w7 >> 2) & 0x3ffff, parm);\ + DST(op,i*32+26, (w7 >> 20) & 0x3ffff, parm);\ + DST(op,i*32+27, (w7 >> 38) & 0x3ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*9+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+28, (w7 >> 56) | (w8 << 8) & 0x3ffff, parm);\ + DST(op,i*32+29, (w8 >> 10) & 0x3ffff, parm);\ + DST(op,i*32+30, (w8 >> 28) & 0x3ffff, parm);\ + DST(op,i*32+31, (w8 >> 46) , parm);;\ +} + +#define BITUNPACK64_18(ip, op, parm) { \ + BITUNBLK64_18(ip, 0, op, parm); DSTI(op); ip += 18*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_19(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*19+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ffff, parm);\ + DST(op,i*64+ 1, (w0 >> 19) & 0x7ffff, parm);\ + DST(op,i*64+ 2, (w0 >> 38) & 0x7ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*19+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 57) | (w1 << 7) & 0x7ffff, parm);\ + DST(op,i*64+ 4, (w1 >> 12) & 0x7ffff, parm);\ + DST(op,i*64+ 5, (w1 >> 31) & 0x7ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*19+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w1 >> 50) | (w2 << 14) & 0x7ffff, parm);\ + DST(op,i*64+ 7, (w2 >> 5) & 0x7ffff, parm);\ + DST(op,i*64+ 8, (w2 >> 24) & 0x7ffff, parm);\ + DST(op,i*64+ 9, (w2 >> 43) & 0x7ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*19+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w2 >> 62) | (w3 << 2) & 0x7ffff, parm);\ + DST(op,i*64+11, (w3 >> 17) & 0x7ffff, parm);\ + DST(op,i*64+12, (w3 >> 36) & 0x7ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*19+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w3 >> 55) | (w4 << 9) & 0x7ffff, parm);\ + DST(op,i*64+14, (w4 >> 10) & 0x7ffff, parm);\ + DST(op,i*64+15, (w4 >> 29) & 0x7ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*19+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w4 >> 48) | (w5 << 16) & 0x7ffff, parm);\ + DST(op,i*64+17, (w5 >> 3) & 0x7ffff, parm);\ + DST(op,i*64+18, (w5 >> 22) & 0x7ffff, parm);\ + DST(op,i*64+19, (w5 >> 41) & 0x7ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*19+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w5 >> 60) | (w6 << 4) & 0x7ffff, parm);\ + DST(op,i*64+21, (w6 >> 15) & 0x7ffff, parm);\ + DST(op,i*64+22, (w6 >> 34) & 0x7ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*19+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w6 >> 53) | (w7 << 11) & 0x7ffff, parm);\ + DST(op,i*64+24, (w7 >> 8) & 0x7ffff, parm);\ + DST(op,i*64+25, (w7 >> 27) & 0x7ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*19+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w7 >> 46) | (w8 << 18) & 0x7ffff, parm);\ + DST(op,i*64+27, (w8 >> 1) & 0x7ffff, parm);\ + DST(op,i*64+28, (w8 >> 20) & 0x7ffff, parm);\ + DST(op,i*64+29, (w8 >> 39) & 0x7ffff, parm); register uint32_t w9 = *(uint32_t *)(ip+(i*19+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w8 >> 58) | (w9 << 6) & 0x7ffff, parm);\ + DST(op,i*64+31, (w9 >> 13) & 0x7ffff, parm);;\ +} + +#define BITUNPACK64_19(ip, op, parm) { \ + BITUNBLK64_19(ip, 0, op, parm); DSTI(op); ip += 19*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_20(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 
0xfffff, parm);\ + DST(op,i*16+ 1, (w0 >> 20) & 0xfffff, parm);\ + DST(op,i*16+ 2, (w0 >> 40) & 0xfffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 3, (w0 >> 60) | (w1 << 4) & 0xfffff, parm);\ + DST(op,i*16+ 4, (w1 >> 16) & 0xfffff, parm);\ + DST(op,i*16+ 5, (w1 >> 36) & 0xfffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 6, (w1 >> 56) | (w2 << 8) & 0xfffff, parm);\ + DST(op,i*16+ 7, (w2 >> 12) & 0xfffff, parm);\ + DST(op,i*16+ 8, (w2 >> 32) & 0xfffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 9, (w2 >> 52) | (w3 << 12) & 0xfffff, parm);\ + DST(op,i*16+10, (w3 >> 8) & 0xfffff, parm);\ + DST(op,i*16+11, (w3 >> 28) & 0xfffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+12, (w3 >> 48) | (w4 << 16) & 0xfffff, parm);\ + DST(op,i*16+13, (w4 >> 4) & 0xfffff, parm);\ + DST(op,i*16+14, (w4 >> 24) & 0xfffff, parm);\ + DST(op,i*16+15, (w4 >> 44) , parm);;\ +} + +#define BITUNPACK64_20(ip, op, parm) { \ + BITUNBLK64_20(ip, 0, op, parm);\ + BITUNBLK64_20(ip, 1, op, parm); DSTI(op); ip += 20*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_21(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*21+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fffff, parm);\ + DST(op,i*64+ 1, (w0 >> 21) & 0x1fffff, parm);\ + DST(op,i*64+ 2, (w0 >> 42) & 0x1fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*21+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 63) | (w1 << 1) & 0x1fffff, parm);\ + DST(op,i*64+ 4, (w1 >> 20) & 0x1fffff, parm);\ + DST(op,i*64+ 5, (w1 >> 41) & 0x1fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*21+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w1 >> 62) | (w2 << 2) & 0x1fffff, parm);\ + DST(op,i*64+ 7, (w2 >> 19) & 0x1fffff, parm);\ + DST(op,i*64+ 8, (w2 >> 40) & 0x1fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*21+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w2 >> 61) | (w3 << 3) & 0x1fffff, parm);\ + DST(op,i*64+10, (w3 >> 18) & 0x1fffff, parm);\ + DST(op,i*64+11, (w3 >> 39) & 0x1fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*21+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w3 >> 60) | (w4 << 4) & 0x1fffff, parm);\ + DST(op,i*64+13, (w4 >> 17) & 0x1fffff, parm);\ + DST(op,i*64+14, (w4 >> 38) & 0x1fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*21+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w4 >> 59) | (w5 << 5) & 0x1fffff, parm);\ + DST(op,i*64+16, (w5 >> 16) & 0x1fffff, parm);\ + DST(op,i*64+17, (w5 >> 37) & 0x1fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*21+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w5 >> 58) | (w6 << 6) & 0x1fffff, parm);\ + DST(op,i*64+19, (w6 >> 15) & 0x1fffff, parm);\ + DST(op,i*64+20, (w6 >> 36) & 0x1fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*21+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w6 >> 57) | (w7 << 7) & 0x1fffff, parm);\ + DST(op,i*64+22, (w7 >> 14) & 0x1fffff, parm);\ + DST(op,i*64+23, (w7 >> 35) & 0x1fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*21+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w7 >> 56) | (w8 << 8) & 0x1fffff, parm);\ + DST(op,i*64+25, (w8 >> 13) & 0x1fffff, parm);\ + DST(op,i*64+26, (w8 >> 34) & 0x1fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*21+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+27, (w8 >> 55) | (w9 << 9) & 0x1fffff, parm);\ + DST(op,i*64+28, (w9 >> 12) & 0x1fffff, parm);\ + DST(op,i*64+29, (w9 >> 33) & 0x1fffff, parm); register uint32_t w10 = 
*(uint32_t *)(ip+(i*21+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w9 >> 54) | (w10 << 10) & 0x1fffff, parm);\ + DST(op,i*64+31, (w10 >> 11) & 0x1fffff, parm);;\ +} + +#define BITUNPACK64_21(ip, op, parm) { \ + BITUNBLK64_21(ip, 0, op, parm); DSTI(op); ip += 21*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_22(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fffff, parm);\ + DST(op,i*32+ 1, (w0 >> 22) & 0x3fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 44) | (w1 << 20) & 0x3fffff, parm);\ + DST(op,i*32+ 3, (w1 >> 2) & 0x3fffff, parm);\ + DST(op,i*32+ 4, (w1 >> 24) & 0x3fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 5, (w1 >> 46) | (w2 << 18) & 0x3fffff, parm);\ + DST(op,i*32+ 6, (w2 >> 4) & 0x3fffff, parm);\ + DST(op,i*32+ 7, (w2 >> 26) & 0x3fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 8, (w2 >> 48) | (w3 << 16) & 0x3fffff, parm);\ + DST(op,i*32+ 9, (w3 >> 6) & 0x3fffff, parm);\ + DST(op,i*32+10, (w3 >> 28) & 0x3fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+11, (w3 >> 50) | (w4 << 14) & 0x3fffff, parm);\ + DST(op,i*32+12, (w4 >> 8) & 0x3fffff, parm);\ + DST(op,i*32+13, (w4 >> 30) & 0x3fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w4 >> 52) | (w5 << 12) & 0x3fffff, parm);\ + DST(op,i*32+15, (w5 >> 10) & 0x3fffff, parm);\ + DST(op,i*32+16, (w5 >> 32) & 0x3fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*11+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w5 >> 54) | (w6 << 10) & 0x3fffff, parm);\ + DST(op,i*32+18, (w6 >> 12) & 0x3fffff, parm);\ + DST(op,i*32+19, (w6 >> 34) & 0x3fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*11+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+20, (w6 >> 56) | (w7 << 8) & 0x3fffff, parm);\ + DST(op,i*32+21, (w7 >> 14) & 0x3fffff, parm);\ + DST(op,i*32+22, (w7 >> 36) & 0x3fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*11+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+23, (w7 >> 58) | (w8 << 6) & 0x3fffff, parm);\ + DST(op,i*32+24, (w8 >> 16) & 0x3fffff, parm);\ + DST(op,i*32+25, (w8 >> 38) & 0x3fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*11+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+26, (w8 >> 60) | (w9 << 4) & 0x3fffff, parm);\ + DST(op,i*32+27, (w9 >> 18) & 0x3fffff, parm);\ + DST(op,i*32+28, (w9 >> 40) & 0x3fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*11+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w9 >> 62) | (w10 << 2) & 0x3fffff, parm);\ + DST(op,i*32+30, (w10 >> 20) & 0x3fffff, parm);\ + DST(op,i*32+31, (w10 >> 42) , parm);;\ +} + +#define BITUNPACK64_22(ip, op, parm) { \ + BITUNBLK64_22(ip, 0, op, parm); DSTI(op); ip += 22*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_23(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*23+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fffff, parm);\ + DST(op,i*64+ 1, (w0 >> 23) & 0x7fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*23+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 46) | (w1 << 18) & 0x7fffff, parm);\ + DST(op,i*64+ 3, (w1 >> 5) & 0x7fffff, parm);\ + DST(op,i*64+ 4, (w1 >> 28) & 0x7fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*23+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w1 >> 51) | (w2 << 13) & 0x7fffff, parm);\ + DST(op,i*64+ 6, (w2 >> 10) & 0x7fffff, parm);\ + DST(op,i*64+ 7, (w2 >> 33) & 0x7fffff, 
parm); register uint64_t w3 = *(uint64_t *)(ip+(i*23+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w2 >> 56) | (w3 << 8) & 0x7fffff, parm);\ + DST(op,i*64+ 9, (w3 >> 15) & 0x7fffff, parm);\ + DST(op,i*64+10, (w3 >> 38) & 0x7fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*23+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w3 >> 61) | (w4 << 3) & 0x7fffff, parm);\ + DST(op,i*64+12, (w4 >> 20) & 0x7fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*23+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w4 >> 43) | (w5 << 21) & 0x7fffff, parm);\ + DST(op,i*64+14, (w5 >> 2) & 0x7fffff, parm);\ + DST(op,i*64+15, (w5 >> 25) & 0x7fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*23+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w5 >> 48) | (w6 << 16) & 0x7fffff, parm);\ + DST(op,i*64+17, (w6 >> 7) & 0x7fffff, parm);\ + DST(op,i*64+18, (w6 >> 30) & 0x7fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*23+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w6 >> 53) | (w7 << 11) & 0x7fffff, parm);\ + DST(op,i*64+20, (w7 >> 12) & 0x7fffff, parm);\ + DST(op,i*64+21, (w7 >> 35) & 0x7fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*23+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w7 >> 58) | (w8 << 6) & 0x7fffff, parm);\ + DST(op,i*64+23, (w8 >> 17) & 0x7fffff, parm);\ + DST(op,i*64+24, (w8 >> 40) & 0x7fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*23+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w8 >> 63) | (w9 << 1) & 0x7fffff, parm);\ + DST(op,i*64+26, (w9 >> 22) & 0x7fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*23+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+27, (w9 >> 45) | (w10 << 19) & 0x7fffff, parm);\ + DST(op,i*64+28, (w10 >> 4) & 0x7fffff, parm);\ + DST(op,i*64+29, (w10 >> 27) & 0x7fffff, parm); register uint32_t w11 = *(uint32_t *)(ip+(i*23+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w10 >> 50) | (w11 << 14) & 0x7fffff, parm);\ + DST(op,i*64+31, (w11 >> 9) & 0x7fffff, parm);;\ +} + +#define BITUNPACK64_23(ip, op, parm) { \ + BITUNBLK64_23(ip, 0, op, parm); DSTI(op); ip += 23*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_24(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ + DST(op,i*8+ 0, (w0 ) & 0xffffff, parm);\ + DST(op,i*8+ 1, (w0 >> 24) & 0xffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*8+ 2, (w0 >> 48) | (w1 << 16) & 0xffffff, parm);\ + DST(op,i*8+ 3, (w1 >> 8) & 0xffffff, parm);\ + DST(op,i*8+ 4, (w1 >> 32) & 0xffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*8+ 5, (w1 >> 56) | (w2 << 8) & 0xffffff, parm);\ + DST(op,i*8+ 6, (w2 >> 16) & 0xffffff, parm);\ + DST(op,i*8+ 7, (w2 >> 40) , parm);;\ +} + +#define BITUNPACK64_24(ip, op, parm) { \ + BITUNBLK64_24(ip, 0, op, parm);\ + BITUNBLK64_24(ip, 1, op, parm);\ + BITUNBLK64_24(ip, 2, op, parm);\ + BITUNBLK64_24(ip, 3, op, parm); DSTI(op); ip += 24*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_25(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*25+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 25) & 0x1ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*25+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 50) | (w1 << 14) & 0x1ffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 11) & 0x1ffffff, parm);\ + DST(op,i*64+ 4, (w1 >> 36) & 0x1ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*25+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w1 >> 61) | (w2 << 3) & 0x1ffffff, parm);\ + DST(op,i*64+ 6, (w2 >> 22) & 0x1ffffff, parm); register 
uint64_t w3 = *(uint64_t *)(ip+(i*25+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w2 >> 47) | (w3 << 17) & 0x1ffffff, parm);\ + DST(op,i*64+ 8, (w3 >> 8) & 0x1ffffff, parm);\ + DST(op,i*64+ 9, (w3 >> 33) & 0x1ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*25+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w3 >> 58) | (w4 << 6) & 0x1ffffff, parm);\ + DST(op,i*64+11, (w4 >> 19) & 0x1ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*25+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w4 >> 44) | (w5 << 20) & 0x1ffffff, parm);\ + DST(op,i*64+13, (w5 >> 5) & 0x1ffffff, parm);\ + DST(op,i*64+14, (w5 >> 30) & 0x1ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*25+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w5 >> 55) | (w6 << 9) & 0x1ffffff, parm);\ + DST(op,i*64+16, (w6 >> 16) & 0x1ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*25+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w6 >> 41) | (w7 << 23) & 0x1ffffff, parm);\ + DST(op,i*64+18, (w7 >> 2) & 0x1ffffff, parm);\ + DST(op,i*64+19, (w7 >> 27) & 0x1ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*25+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w7 >> 52) | (w8 << 12) & 0x1ffffff, parm);\ + DST(op,i*64+21, (w8 >> 13) & 0x1ffffff, parm);\ + DST(op,i*64+22, (w8 >> 38) & 0x1ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*25+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w8 >> 63) | (w9 << 1) & 0x1ffffff, parm);\ + DST(op,i*64+24, (w9 >> 24) & 0x1ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*25+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w9 >> 49) | (w10 << 15) & 0x1ffffff, parm);\ + DST(op,i*64+26, (w10 >> 10) & 0x1ffffff, parm);\ + DST(op,i*64+27, (w10 >> 35) & 0x1ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*25+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w10 >> 60) | (w11 << 4) & 0x1ffffff, parm);\ + DST(op,i*64+29, (w11 >> 21) & 0x1ffffff, parm); register uint32_t w12 = *(uint32_t *)(ip+(i*25+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w11 >> 46) | (w12 << 18) & 0x1ffffff, parm);\ + DST(op,i*64+31, (w12 >> 7) & 0x1ffffff, parm);;\ +} + +#define BITUNPACK64_25(ip, op, parm) { \ + BITUNBLK64_25(ip, 0, op, parm); DSTI(op); ip += 25*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_26(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ffffff, parm);\ + DST(op,i*32+ 1, (w0 >> 26) & 0x3ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 52) | (w1 << 12) & 0x3ffffff, parm);\ + DST(op,i*32+ 3, (w1 >> 14) & 0x3ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w1 >> 40) | (w2 << 24) & 0x3ffffff, parm);\ + DST(op,i*32+ 5, (w2 >> 2) & 0x3ffffff, parm);\ + DST(op,i*32+ 6, (w2 >> 28) & 0x3ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 7, (w2 >> 54) | (w3 << 10) & 0x3ffffff, parm);\ + DST(op,i*32+ 8, (w3 >> 16) & 0x3ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 9, (w3 >> 42) | (w4 << 22) & 0x3ffffff, parm);\ + DST(op,i*32+10, (w4 >> 4) & 0x3ffffff, parm);\ + DST(op,i*32+11, (w4 >> 30) & 0x3ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w4 >> 56) | (w5 << 8) & 0x3ffffff, parm);\ + DST(op,i*32+13, (w5 >> 18) & 0x3ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w5 >> 44) | (w6 << 20) & 0x3ffffff, parm);\ + 
DST(op,i*32+15, (w6 >> 6) & 0x3ffffff, parm);\ + DST(op,i*32+16, (w6 >> 32) & 0x3ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*13+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w6 >> 58) | (w7 << 6) & 0x3ffffff, parm);\ + DST(op,i*32+18, (w7 >> 20) & 0x3ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*13+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w7 >> 46) | (w8 << 18) & 0x3ffffff, parm);\ + DST(op,i*32+20, (w8 >> 8) & 0x3ffffff, parm);\ + DST(op,i*32+21, (w8 >> 34) & 0x3ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*13+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+22, (w8 >> 60) | (w9 << 4) & 0x3ffffff, parm);\ + DST(op,i*32+23, (w9 >> 22) & 0x3ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*13+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+24, (w9 >> 48) | (w10 << 16) & 0x3ffffff, parm);\ + DST(op,i*32+25, (w10 >> 10) & 0x3ffffff, parm);\ + DST(op,i*32+26, (w10 >> 36) & 0x3ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*13+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w10 >> 62) | (w11 << 2) & 0x3ffffff, parm);\ + DST(op,i*32+28, (w11 >> 24) & 0x3ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*13+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w11 >> 50) | (w12 << 14) & 0x3ffffff, parm);\ + DST(op,i*32+30, (w12 >> 12) & 0x3ffffff, parm);\ + DST(op,i*32+31, (w12 >> 38) , parm);;\ +} + +#define BITUNPACK64_26(ip, op, parm) { \ + BITUNBLK64_26(ip, 0, op, parm); DSTI(op); ip += 26*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_27(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*27+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 27) & 0x7ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*27+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 54) | (w1 << 10) & 0x7ffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 17) & 0x7ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*27+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 44) | (w2 << 20) & 0x7ffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 7) & 0x7ffffff, parm);\ + DST(op,i*64+ 6, (w2 >> 34) & 0x7ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*27+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w2 >> 61) | (w3 << 3) & 0x7ffffff, parm);\ + DST(op,i*64+ 8, (w3 >> 24) & 0x7ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*27+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w3 >> 51) | (w4 << 13) & 0x7ffffff, parm);\ + DST(op,i*64+10, (w4 >> 14) & 0x7ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*27+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w4 >> 41) | (w5 << 23) & 0x7ffffff, parm);\ + DST(op,i*64+12, (w5 >> 4) & 0x7ffffff, parm);\ + DST(op,i*64+13, (w5 >> 31) & 0x7ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*27+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w5 >> 58) | (w6 << 6) & 0x7ffffff, parm);\ + DST(op,i*64+15, (w6 >> 21) & 0x7ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*27+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w6 >> 48) | (w7 << 16) & 0x7ffffff, parm);\ + DST(op,i*64+17, (w7 >> 11) & 0x7ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*27+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w7 >> 38) | (w8 << 26) & 0x7ffffff, parm);\ + DST(op,i*64+19, (w8 >> 1) & 0x7ffffff, parm);\ + DST(op,i*64+20, (w8 >> 28) & 0x7ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*27+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w8 >> 55) | (w9 << 9) & 0x7ffffff, parm);\ + DST(op,i*64+22, (w9 >> 18) & 0x7ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*27+10)*8/sizeof(ip[0]));\ +\ + 
DST(op,i*64+23, (w9 >> 45) | (w10 << 19) & 0x7ffffff, parm);\ + DST(op,i*64+24, (w10 >> 8) & 0x7ffffff, parm);\ + DST(op,i*64+25, (w10 >> 35) & 0x7ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*27+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w10 >> 62) | (w11 << 2) & 0x7ffffff, parm);\ + DST(op,i*64+27, (w11 >> 25) & 0x7ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*27+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w11 >> 52) | (w12 << 12) & 0x7ffffff, parm);\ + DST(op,i*64+29, (w12 >> 15) & 0x7ffffff, parm); register uint32_t w13 = *(uint32_t *)(ip+(i*27+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w12 >> 42) | (w13 << 22) & 0x7ffffff, parm);\ + DST(op,i*64+31, (w13 >> 5) & 0x7ffffff, parm);;\ +} + +#define BITUNPACK64_27(ip, op, parm) { \ + BITUNBLK64_27(ip, 0, op, parm); DSTI(op); ip += 27*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_28(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 0xfffffff, parm);\ + DST(op,i*16+ 1, (w0 >> 28) & 0xfffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 2, (w0 >> 56) | (w1 << 8) & 0xfffffff, parm);\ + DST(op,i*16+ 3, (w1 >> 20) & 0xfffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 4, (w1 >> 48) | (w2 << 16) & 0xfffffff, parm);\ + DST(op,i*16+ 5, (w2 >> 12) & 0xfffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 6, (w2 >> 40) | (w3 << 24) & 0xfffffff, parm);\ + DST(op,i*16+ 7, (w3 >> 4) & 0xfffffff, parm);\ + DST(op,i*16+ 8, (w3 >> 32) & 0xfffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 9, (w3 >> 60) | (w4 << 4) & 0xfffffff, parm);\ + DST(op,i*16+10, (w4 >> 24) & 0xfffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+11, (w4 >> 52) | (w5 << 12) & 0xfffffff, parm);\ + DST(op,i*16+12, (w5 >> 16) & 0xfffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+13, (w5 >> 44) | (w6 << 20) & 0xfffffff, parm);\ + DST(op,i*16+14, (w6 >> 8) & 0xfffffff, parm);\ + DST(op,i*16+15, (w6 >> 36) , parm);;\ +} + +#define BITUNPACK64_28(ip, op, parm) { \ + BITUNBLK64_28(ip, 0, op, parm);\ + BITUNBLK64_28(ip, 1, op, parm); DSTI(op); ip += 28*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_29(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*29+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 29) & 0x1fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*29+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 58) | (w1 << 6) & 0x1fffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 23) & 0x1fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*29+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 52) | (w2 << 12) & 0x1fffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 17) & 0x1fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*29+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w2 >> 46) | (w3 << 18) & 0x1fffffff, parm);\ + DST(op,i*64+ 7, (w3 >> 11) & 0x1fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*29+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w3 >> 40) | (w4 << 24) & 0x1fffffff, parm);\ + DST(op,i*64+ 9, (w4 >> 5) & 0x1fffffff, parm);\ + DST(op,i*64+10, (w4 >> 34) & 0x1fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*29+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w4 >> 63) | (w5 << 1) & 0x1fffffff, parm);\ + DST(op,i*64+12, (w5 >> 28) & 
0x1fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*29+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w5 >> 57) | (w6 << 7) & 0x1fffffff, parm);\ + DST(op,i*64+14, (w6 >> 22) & 0x1fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*29+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w6 >> 51) | (w7 << 13) & 0x1fffffff, parm);\ + DST(op,i*64+16, (w7 >> 16) & 0x1fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*29+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w7 >> 45) | (w8 << 19) & 0x1fffffff, parm);\ + DST(op,i*64+18, (w8 >> 10) & 0x1fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*29+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w8 >> 39) | (w9 << 25) & 0x1fffffff, parm);\ + DST(op,i*64+20, (w9 >> 4) & 0x1fffffff, parm);\ + DST(op,i*64+21, (w9 >> 33) & 0x1fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*29+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w9 >> 62) | (w10 << 2) & 0x1fffffff, parm);\ + DST(op,i*64+23, (w10 >> 27) & 0x1fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*29+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w10 >> 56) | (w11 << 8) & 0x1fffffff, parm);\ + DST(op,i*64+25, (w11 >> 21) & 0x1fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*29+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w11 >> 50) | (w12 << 14) & 0x1fffffff, parm);\ + DST(op,i*64+27, (w12 >> 15) & 0x1fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*29+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w12 >> 44) | (w13 << 20) & 0x1fffffff, parm);\ + DST(op,i*64+29, (w13 >> 9) & 0x1fffffff, parm); register uint32_t w14 = *(uint32_t *)(ip+(i*29+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w13 >> 38) | (w14 << 26) & 0x1fffffff, parm);\ + DST(op,i*64+31, (w14 >> 3) & 0x1fffffff, parm);;\ +} + +#define BITUNPACK64_29(ip, op, parm) { \ + BITUNBLK64_29(ip, 0, op, parm); DSTI(op); ip += 29*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_30(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fffffff, parm);\ + DST(op,i*32+ 1, (w0 >> 30) & 0x3fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 60) | (w1 << 4) & 0x3fffffff, parm);\ + DST(op,i*32+ 3, (w1 >> 26) & 0x3fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w1 >> 56) | (w2 << 8) & 0x3fffffff, parm);\ + DST(op,i*32+ 5, (w2 >> 22) & 0x3fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 6, (w2 >> 52) | (w3 << 12) & 0x3fffffff, parm);\ + DST(op,i*32+ 7, (w3 >> 18) & 0x3fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 8, (w3 >> 48) | (w4 << 16) & 0x3fffffff, parm);\ + DST(op,i*32+ 9, (w4 >> 14) & 0x3fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w4 >> 44) | (w5 << 20) & 0x3fffffff, parm);\ + DST(op,i*32+11, (w5 >> 10) & 0x3fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w5 >> 40) | (w6 << 24) & 0x3fffffff, parm);\ + DST(op,i*32+13, (w6 >> 6) & 0x3fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w6 >> 36) | (w7 << 28) & 0x3fffffff, parm);\ + DST(op,i*32+15, (w7 >> 2) & 0x3fffffff, parm);\ + DST(op,i*32+16, (w7 >> 32) & 0x3fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*15+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w7 >> 62) | (w8 << 2) & 0x3fffffff, 
parm);\ + DST(op,i*32+18, (w8 >> 28) & 0x3fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*15+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w8 >> 58) | (w9 << 6) & 0x3fffffff, parm);\ + DST(op,i*32+20, (w9 >> 24) & 0x3fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*15+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w9 >> 54) | (w10 << 10) & 0x3fffffff, parm);\ + DST(op,i*32+22, (w10 >> 20) & 0x3fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*15+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+23, (w10 >> 50) | (w11 << 14) & 0x3fffffff, parm);\ + DST(op,i*32+24, (w11 >> 16) & 0x3fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*15+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+25, (w11 >> 46) | (w12 << 18) & 0x3fffffff, parm);\ + DST(op,i*32+26, (w12 >> 12) & 0x3fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*15+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w12 >> 42) | (w13 << 22) & 0x3fffffff, parm);\ + DST(op,i*32+28, (w13 >> 8) & 0x3fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*15+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w13 >> 38) | (w14 << 26) & 0x3fffffff, parm);\ + DST(op,i*32+30, (w14 >> 4) & 0x3fffffff, parm);\ + DST(op,i*32+31, (w14 >> 34) , parm);;\ +} + +#define BITUNPACK64_30(ip, op, parm) { \ + BITUNBLK64_30(ip, 0, op, parm); DSTI(op); ip += 30*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_31(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*31+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 31) & 0x7fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*31+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 62) | (w1 << 2) & 0x7fffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 29) & 0x7fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*31+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 60) | (w2 << 4) & 0x7fffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 27) & 0x7fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*31+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w2 >> 58) | (w3 << 6) & 0x7fffffff, parm);\ + DST(op,i*64+ 7, (w3 >> 25) & 0x7fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*31+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w3 >> 56) | (w4 << 8) & 0x7fffffff, parm);\ + DST(op,i*64+ 9, (w4 >> 23) & 0x7fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*31+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w4 >> 54) | (w5 << 10) & 0x7fffffff, parm);\ + DST(op,i*64+11, (w5 >> 21) & 0x7fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*31+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w5 >> 52) | (w6 << 12) & 0x7fffffff, parm);\ + DST(op,i*64+13, (w6 >> 19) & 0x7fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*31+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w6 >> 50) | (w7 << 14) & 0x7fffffff, parm);\ + DST(op,i*64+15, (w7 >> 17) & 0x7fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*31+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w7 >> 48) | (w8 << 16) & 0x7fffffff, parm);\ + DST(op,i*64+17, (w8 >> 15) & 0x7fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*31+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w8 >> 46) | (w9 << 18) & 0x7fffffff, parm);\ + DST(op,i*64+19, (w9 >> 13) & 0x7fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*31+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w9 >> 44) | (w10 << 20) & 0x7fffffff, parm);\ + DST(op,i*64+21, (w10 >> 11) & 0x7fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*31+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w10 >> 42) | (w11 << 22) & 0x7fffffff, parm);\ + 
DST(op,i*64+23, (w11 >> 9) & 0x7fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*31+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w11 >> 40) | (w12 << 24) & 0x7fffffff, parm);\ + DST(op,i*64+25, (w12 >> 7) & 0x7fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*31+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w12 >> 38) | (w13 << 26) & 0x7fffffff, parm);\ + DST(op,i*64+27, (w13 >> 5) & 0x7fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*31+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w13 >> 36) | (w14 << 28) & 0x7fffffff, parm);\ + DST(op,i*64+29, (w14 >> 3) & 0x7fffffff, parm); register uint32_t w15 = *(uint32_t *)(ip+(i*31+15)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w14 >> 34) | (w15 << 30) & 0x7fffffff, parm);\ + DST(op,i*64+31, (w15 >> 1) & 0x7fffffff, parm);;\ +} + +#define BITUNPACK64_31(ip, op, parm) { \ + BITUNBLK64_31(ip, 0, op, parm); DSTI(op); ip += 31*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_32(ip, i, op, parm) { \ + DST(op,i*2+ 0, *(uint32_t *)(ip+i*8+ 0), parm);\ + DST(op,i*2+ 1, *(uint32_t *)(ip+i*8+ 4), parm);;\ +} + +#define BITUNPACK64_32(ip, op, parm) { \ + BITUNBLK64_32(ip, 0, op, parm);\ + BITUNBLK64_32(ip, 1, op, parm);\ + BITUNBLK64_32(ip, 2, op, parm);\ + BITUNBLK64_32(ip, 3, op, parm);\ + BITUNBLK64_32(ip, 4, op, parm);\ + BITUNBLK64_32(ip, 5, op, parm);\ + BITUNBLK64_32(ip, 6, op, parm);\ + BITUNBLK64_32(ip, 7, op, parm);\ + BITUNBLK64_32(ip, 8, op, parm);\ + BITUNBLK64_32(ip, 9, op, parm);\ + BITUNBLK64_32(ip, 10, op, parm);\ + BITUNBLK64_32(ip, 11, op, parm);\ + BITUNBLK64_32(ip, 12, op, parm);\ + BITUNBLK64_32(ip, 13, op, parm);\ + BITUNBLK64_32(ip, 14, op, parm);\ + BITUNBLK64_32(ip, 15, op, parm); DSTI(op); ip += 32*4/sizeof(ip[0]);\ +} + diff --git a/bitunpack_.h b/bitunpack_.h new file mode 100644 index 0000000..172e3d4 --- /dev/null +++ b/bitunpack_.h @@ -0,0 +1,112 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+  - email    : powturbo@gmail.com
+  - github   : https://github.com/powturbo
+  - homepage : https://sites.google.com/site/powturbo/
+  - twitter  : https://twitter.com/powturbo
+
+    bitunpack_.h - "Integer Compression" binary packing
+**/
+
+#include <stdint.h>
+#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w,__parm) //__op[__x] = BPI(__w,__parm) //
+#define DSTI(__op) //__op += 32 //
+
+#define USE_BITUNPACK 64
+
+  #if USE_BITUNPACK == 64
+#include "bitunpack64_.h"
+#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\
+  switch(__nbits) {\
+    case  0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\
+    case  1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\
+    case  2: do BITUNPACK64_2( __ip, __op, __parm) while(__op<__ope); break;\
+    case  3: do BITUNPACK64_3( __ip, __op, __parm) while(__op<__ope); break;\
+    case  4: do BITUNPACK64_4( __ip, __op, __parm) while(__op<__ope); break;\
+    case  5: do BITUNPACK64_5( __ip, __op, __parm) while(__op<__ope); break;\
+    case  6: do BITUNPACK64_6( __ip, __op, __parm) while(__op<__ope); break;\
+    case  7: do BITUNPACK64_7( __ip, __op, __parm) while(__op<__ope); break;\
+    case  8: do BITUNPACK64_8( __ip, __op, __parm) while(__op<__ope); break;\
+    case  9: do BITUNPACK64_9( __ip, __op, __parm) while(__op<__ope); break;\
+    case 10: do BITUNPACK64_10(__ip, __op, __parm) while(__op<__ope); break;\
+    case 11: do BITUNPACK64_11(__ip, __op, __parm) while(__op<__ope); break;\
+    case 12: do BITUNPACK64_12(__ip, __op, __parm) while(__op<__ope); break;\
+    case 13: do BITUNPACK64_13(__ip, __op, __parm) while(__op<__ope); break;\
+    case 14: do BITUNPACK64_14(__ip, __op, __parm) while(__op<__ope); break;\
+    case 15: do BITUNPACK64_15(__ip, __op, __parm) while(__op<__ope); break;\
+    case 16: do BITUNPACK64_16(__ip, __op, __parm) while(__op<__ope); break;\
+    case 17: do BITUNPACK64_17(__ip, __op, __parm) while(__op<__ope); break;\
+    case 18: do BITUNPACK64_18(__ip, __op, __parm) while(__op<__ope); break;\
+    case 19: do BITUNPACK64_19(__ip, __op, __parm) while(__op<__ope); break;\
+    case 20: do BITUNPACK64_20(__ip, __op, __parm) while(__op<__ope); break;\
+    case 21: do BITUNPACK64_21(__ip, __op, __parm) while(__op<__ope); break;\
+    case 22: do BITUNPACK64_22(__ip, __op, __parm) while(__op<__ope); break;\
+    case 23: do BITUNPACK64_23(__ip, __op, __parm) while(__op<__ope); break;\
+    case 24: do BITUNPACK64_24(__ip, __op, __parm) while(__op<__ope); break;\
+    case 25: do BITUNPACK64_25(__ip, __op, __parm) while(__op<__ope); break;\
+    case 26: do BITUNPACK64_26(__ip, __op, __parm) while(__op<__ope); break;\
+    case 27: do BITUNPACK64_27(__ip, __op, __parm) while(__op<__ope); break;\
+    case 28: do BITUNPACK64_28(__ip, __op, __parm) while(__op<__ope); break;\
+    case 29: do BITUNPACK64_29(__ip, __op, __parm) while(__op<__ope); break;\
+    case 30: do BITUNPACK64_30(__ip, __op, __parm) while(__op<__ope); break;\
+    case 31: do BITUNPACK64_31(__ip, __op, __parm) while(__op<__ope); break;\
+    case 32: do BITUNPACK64_32(__ip, __op, __parm) while(__op<__ope); break;\
+  }\
+}
+  #elif USE_BITUNPACK == 32
+#include "bitunpack32_.h"
+#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\
+  switch(__nbits) {\
+    case  0: do BITUNPACK32_0( __ip, __op, __parm) while(__op<__ope); break;\
+    case  1: do BITUNPACK32_1( __ip, __op, __parm) while(__op<__ope); break;\
+    case  2: do BITUNPACK32_2( __ip, __op, __parm) while(__op<__ope); break;\
+    case  3: do BITUNPACK32_3( __ip, __op, __parm) while(__op<__ope); break;\
+    case  4: do BITUNPACK32_4( __ip, __op, __parm) while(__op<__ope); break;\
+    case  5: do BITUNPACK32_5( __ip, __op, __parm) while(__op<__ope); break;\
+    case  6: do BITUNPACK32_6( __ip, __op, __parm) while(__op<__ope); break;\
+    case  7: do BITUNPACK32_7( __ip, __op, __parm) while(__op<__ope); break;\
+    case  8: do BITUNPACK32_8( __ip, __op, __parm) while(__op<__ope); break;\
+    case  9: do BITUNPACK32_9( __ip, __op, __parm) while(__op<__ope); break;\
+    case 10: do BITUNPACK32_10(__ip, __op, __parm) while(__op<__ope); break;\
+    case 11: do BITUNPACK32_11(__ip, __op, __parm) while(__op<__ope); break;\
+    case 12: do BITUNPACK32_12(__ip, __op, __parm) while(__op<__ope); break;\
+    case 13: do BITUNPACK32_13(__ip, __op, __parm) while(__op<__ope); break;\
+    case 14: do BITUNPACK32_14(__ip, __op, __parm) while(__op<__ope); break;\
+    case 15: do BITUNPACK32_15(__ip, __op, __parm) while(__op<__ope); break;\
+    case 16: do BITUNPACK32_16(__ip, __op, __parm) while(__op<__ope); break;\
+    case 17: do BITUNPACK32_17(__ip, __op, __parm) while(__op<__ope); break;\
+    case 18: do BITUNPACK32_18(__ip, __op, __parm) while(__op<__ope); break;\
+    case 19: do BITUNPACK32_19(__ip, __op, __parm) while(__op<__ope); break;\
+    case 20: do BITUNPACK32_20(__ip, __op, __parm) while(__op<__ope); break;\
+    case 21: do BITUNPACK32_21(__ip, __op, __parm) while(__op<__ope); break;\
+    case 22: do BITUNPACK32_22(__ip, __op, __parm) while(__op<__ope); break;\
+    case 23: do BITUNPACK32_23(__ip, __op, __parm) while(__op<__ope); break;\
+    case 24: do BITUNPACK32_24(__ip, __op, __parm) while(__op<__ope); break;\
+    case 25: do BITUNPACK32_25(__ip, __op, __parm) while(__op<__ope); break;\
+    case 26: do BITUNPACK32_26(__ip, __op, __parm) while(__op<__ope); break;\
+    case 27: do BITUNPACK32_27(__ip, __op, __parm) while(__op<__ope); break;\
+    case 28: do BITUNPACK32_28(__ip, __op, __parm) while(__op<__ope); break;\
+    case 29: do BITUNPACK32_29(__ip, __op, __parm) while(__op<__ope); break;\
+    case 30: do BITUNPACK32_30(__ip, __op, __parm) while(__op<__ope); break;\
+    case 31: do BITUNPACK32_31(__ip, __op, __parm) while(__op<__ope); break;\
+    case 32: do BITUNPACK32_32(__ip, __op, __parm) while(__op<__ope); break;\
+  } /*printf("n=%d,%d,%d ", __n, __op, __parm - sd, __op, __parme - __op);*/\
+}
+#endif
+
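The BITUNBLK64_b bodies above decode with 64-bit loads: each value is one shift-and-mask, and a value straddling a word boundary is stitched together as (w >> s) | (w' << 64-s) & mask; BITUNPACK32 then switches on the run-time width so every case runs fully unrolled with compile-time shift counts. A minimal generic equivalent of one such case, for reference only (unpack_b is an illustrative helper, not part of this patch; it assumes a little-endian target and an input buffer padded past the last value, a hazard the macros avoid by loading the final word as uint32_t):

    #include <stdint.h>
    #include <string.h>

    /* reference decoder: n b-bit values, 0 < b < 32, packed LSB-first */
    static void unpack_b(const uint32_t *in, int n, int b, uint32_t *out) {
      const unsigned char *p = (const unsigned char *)in;
      uint32_t mask = (1u << b) - 1;
      int i, bits = 0;                      /* running bit offset into the stream */
      for(i = 0; i < n; i++, bits += b) {
        uint64_t w;
        memcpy(&w, p + (bits >> 3), 8);     /* unaligned little-endian load */
        out[i] = (uint32_t)(w >> (bits & 7)) & mask;  /* 7+31 < 64: one load suffices */
      }
    }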
diff --git a/conf.h b/conf.h
new file mode 100644
index 0000000..2383ad1
--- /dev/null
+++ b/conf.h
@@ -0,0 +1,70 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+  - email    : powturbo@gmail.com
+  - github   : https://github.com/powturbo
+  - homepage : https://sites.google.com/site/powturbo/
+  - twitter  : https://twitter.com/powturbo
+
+    conf.h - "Integer Compression" config & common
+**/
+
+
+  #if defined(__GNUC__)
+#define ALIGNED(t,v,n) __attribute__ ((aligned (n))) t v
+#define ALWAYS_INLINE  __attribute__((always_inline))
+#define _PACKED        __attribute__ ((packed))
+#define likely(x)      __builtin_expect((x),1)
+#define unlikely(x)    __builtin_expect((x),0)
+
+#define popcnt32(__x)  __builtin_popcount(__x)
+#define popcnt64(__x)  __builtin_popcountll(__x)
+
+#define TEMPLATE2_(__x, __y) __x##__y
+#define TEMPLATE2(__x, __y) TEMPLATE2_(__x,__y)
+
+#define TEMPLATE3_(x,y,z) x ## y ## z
+#define TEMPLATE3(x,y,z) TEMPLATE3_(x, y, z)
+
+  #if defined(__x86_64__) || defined(__x86_32__)
+static inline int bsr32(int x) {
+  int b = -1;
+  asm("bsrl %1,%0" : "+r" (b): "rm" (x) );
+  return b + 1;
+}
+
+static inline int bsr64(unsigned long long x) {
+  return x?64 - __builtin_clzll(x):0;
+}
+
+#define bsr16(__x) bsr32(__x)
+  #else
+static inline int bsr32(int x) {
+  return x?32 - __builtin_clz(x):0;
+}
+
+static inline int bsr64(unsigned long long x) {
+  return x?64 - __builtin_clzll(x):0;
+}
+  #endif
+#define ctzll(__x) __builtin_ctzll(__x)
+  #else
+#error "only gcc support in this version"
+  #endif
+
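bsr32()/bsr64() above return the position of the highest set bit plus one, i.e. the minimal bit width needed to represent a value; the packers use it to choose the per-block width b that the BITUNPACK32 cases then decode. A minimal sketch of that width selection (blockwidth is an illustrative helper, not part of this patch):

    #include <stdint.h>

    /* smallest b such that every value in in[0..n-1] fits in b bits */
    static int blockwidth(const uint32_t *in, int n) {
      uint32_t mx = 0;
      int i;
      for(i = 0; i < n; i++) mx |= in[i];      /* OR is enough to find the top bit */
      return mx ? 32 - __builtin_clz(mx) : 0;  /* same result as conf.h bsr32 */
    }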
diff --git a/icbench.c b/icbench.c
new file mode 100644
index 0000000..d417e9f
--- /dev/null
+++ b/icbench.c
@@ -0,0 +1,617 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+  - email    : powturbo@gmail.com
+  - github   : https://github.com/powturbo
+  - homepage : https://sites.google.com/site/powturbo/
+  - twitter  : https://twitter.com/powturbo
+
+    icbench.c - "Integer Compression" benchmark program
+**/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <getopt.h>
+
+#include <sys/stat.h>
+#include <unistd.h>
+#include <time.h>
+#define PGM_FD(__f) struct stat sbuf; fstat(__f, &sbuf); __off64_t vlen = sbuf.st_size, vtel = 0; int pgm = 0; time_t t0 = time(NULL);
+#define PGM_FDPUT(__f) vtel = lseek(__f, 0, SEEK_CUR);if(vtel*10/vlen != pgm) { double secs = time(NULL) - t0; pgm = vtel*10/vlen; printf("%d%%%.1f ", pgm, ((secs/60.0) * (vlen - vtel))/vtel); fflush(stdout); }
+//-------------------------------------------------------------------------------------------------------------
+typedef unsigned long long tm_t;
+#define TM_TMAX (1ull<<63)
+
+  #ifdef _MSC_VER // __rdtsc
+#include <intrin.h>
+  #else
+#include <x86intrin.h>
+  #endif
+
+  #ifdef _WIN32
+#include <windows.h>
+#define TM_T 1
+
+LARGE_INTEGER tps;
+static tm_t tmtime(void) {
+  LARGE_INTEGER tm;
+  QueryPerformanceCounter(&tm);
+  return (tm_t)(tm.QuadPart/tps.QuadPart);
+}
+
+static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+  #else
+#include <sys/time.h>
+#define TM_T 1000000.0
+static tm_t tmtime(void) {
+  struct timeval tm;
+  gettimeofday(&tm, NULL);
+  return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec;
+}
+
+static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+  #endif
+//--------------------------------------------------------------------------------------------------------
+#include "vint.h"
+#include "vsimple.h"
+
+#include "bitpack.h"
+#include "bitunpack.h"
+#include "vp4dc.h"
+#include "vp4dd.h"
+
+#include "aux/vas16c.h"
+#include "aux/vas16d.h"
+#include "aux/OPT_PFD/opt_p4.h"
+#include "aux/vabyte.h"
+#include "aux/simple8b.h"
+#include "aux/varintg8iu.h"
+
+unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+  uint32_t *in_;
+  for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
+  return (unsigned char *)out;
+}
+
+unsigned char *simdpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+  uint32_t *in_;
+  for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpack(in, (__m128i *)out, b);
+  return (unsigned char *)out;
+}
+
+unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
+  uint32_t *out_;
+  for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack(in, out, b);
+  return (unsigned char *)in;
+}
+
+unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+  uint32_t *in_;
+  for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); //simdpackwithoutmaskd1(x, ip+1, (__m128i *)out, b);
+  return (unsigned char *)out;
+}
+
+unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
+  uint32_t *out_;
+  for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b);
+  return (unsigned char *)in;
+}
+
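The simd*n wrappers above drive simdcomp in fixed blocks of 128 integers; one packed block takes 128*b bits, i.e. 4*b 32-bit words, which is why the output pointer advances by 4*b per block. A minimal round-trip sketch under the wrappers' own assumptions (n a multiple of 128 and b at least the bit width of the largest value):

    static void simd_roundtrip_sketch(void) {
      uint32_t in[128], buf[128], dec[128];
      unsigned i, b = 13;                /* every input value fits in 13 bits */
      for(i = 0; i < 128; i++) in[i] = i * 37 & 0x1fff;
      simdpackwn(in, 128, b, buf);       /* one block: 4*13 = 52 output words */
      simdunpackn(buf, 128, b, dec);     /* dec[] now equals in[] */
    }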
+unsigned char *u32enc(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return (unsigned char *)out;}
+unsigned char *u32dec(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return (unsigned char *)in;}
+
+#include "aux/vbyte_poly.h"
+unsigned char *vavbyte1enc(int *in, int n, unsigned char *out) {
+  int i; for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } return out;
+}
+void vavbyte1dec(unsigned char *in, int n, int *out) {
+  int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; }
+}
+
+//-------------------------------------------------------------------------------------------------
+#define VBLIM 64
+enum {
+  P_CPY,
+  P_VB, P_VBL, P_VG8,
+  P_PCK, P_PCKR, P_SIMDH,
+  P_SV, P_S16, P_S8BO,
+  P_P4D, P_P4DR, P_OPTP4
+};
+
+unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int bb) {
+  unsigned *ip=in;
+  int i,b;
+
+  switch(id) {
+    case P_CPY:
+      out = u32enc(   ip, n, (unsigned *)out); break;
+    case P_VB:
+      out = vbenc(    ip, n, out); break;
+    case P_VBL:
+      out = vbyteenc( ip, n, out); break;
+    case P_VG8:
+      out = vintg8enc(ip, n, out); break;
+
+    //----------- simple -------------------
+    case P_SV:
+      out = vsenc32(  ip, n, out); break;
+    case P_S16:
+      { unsigned *c=ip,*ce=c+n;
+        while(c < ce) S16ENC(out, c, ce - c);
+      }
+      break;
+    case P_S8BO:
+      out = s8benco(  ip, n, out);
+      break;
+
+    //----------- PFOR -------------------
+    case P_P4DR:
+    case P_P4D:
+      if(n>= 5;
+      }
+      *op = x;
+      in = bitunpack32( in, n-1, b, op+1);
+    }
+    break;
+    case P_PCKR:
+      {
+        unsigned x;
+        vbgeta(in, x, ;);
+        if(bb < 0) {
+          b = x & 0x1f; x >>= 5;
+        }
+        *op = x;
+        in = _bitunpackx32(in, n-1, b, op+1);
+      }
+      break;
+    case P_SIMDH:
+      if(n <129) in = vbytedec(in, n, op);
+      else {
+        unsigned x;
+        vbgeta(in, x, ;);
+        if(bb < 0) {
+          b = x & 0x1f; x >>= 5;
+        }
+        *op = x;
+        in = simdunpackn( in, n-1, b, op+1);
+      }
+      break;
+    default: printf("Fatal- Not entry %d", id); exit(0);
+  }
+  return in;
+}
+
+struct libss { int id; char *s,*v; };
+
+struct libss libss[] = {
+  { P_CPY,   "copy", },
+  { P_VB,    "TurboVbyte" },
+  { P_VBL,   "Vbyte FPF" },
+  { P_VG8,   "vg8iu" },
+
+  { P_SV,    "simpleV" },
+  { P_S8BO,  "simple 8b" },
+  { P_S16,   "simple16" },
+
+  { P_P4DR,  "TurboPFor DA" },
+  { P_P4D,   "TurboPFor" },
+  { P_OPTP4, "OptP4" },
+
+  { P_PCK,   "TurboPack" },
+  { P_PCKR,  "TurboPack DA" },
+  { P_SIMDH, "SIMDBitPack FPF" },
+  { -1, "" },
+};
+
+//---------------------------------------------------------------------------------------------
+#define MAXT 8
+#define BLK_SIZE 129
+#define MB (1024*1024)
+
+int verb = 0, reps = 100000, trips = 3;
+enum { T_ZIPF=1, T_ID };
+
+struct libs { int id,err; char *s,*v; unsigned long long l; double tc,td; };
+struct libs libs[64];
+
+int l_cmp(const void *va, const void *vb) {
+  const struct libs *a = va, *b = vb;
+  if(a->l < b->l || a->l == b->l && a->td < b->td) return -1;
+  if(a->l > b->l || a->l == b->l && a->td > b->td) return 1;
+  return 0;
+}
+
+void check(unsigned *in, unsigned n, unsigned *out, char *s) {
+  unsigned k,j;
+  for(k = 0; k < n; k++)
+    if(in[k] != out[k]) {
+      printf("\nFATAL in check %x,%x at %u[%u] in %s\n", in[k], out[k], k, n, s);
+      for(j=k & 0xffffff80u; j < k+128;j++)
+        printf("%d:%x,%x ", j, in[j], out[j] );printf("\n");
+      exit(0);
+    }
+}
+
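print() below sorts the entries by compressed size (then decode time) and derives its columns from l compressed bytes over n 32-bit integers with best-rep times tc/td in TM_T units: ratio = l*100/(4n) percent, cost = l*8/n bits per integer, and speed = (n/10^6)/(t/TM_T) in millions of integers per second. As a worked example, n = 10,000,000 integers compressed to l = 12,500,000 bytes and decoded in td = 50,000 microseconds give 31.25%, 10.0 bits/integer and 200 MI/s.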
+void print(unsigned long long n, char *s) {
+  int m, k;
+  for(k = 0; libs[k].id >= 0; k++);
+  qsort(libs, k, sizeof(libs[0]), l_cmp);
+
+  for(m = 0; m < k; m++)
+    if(libs[m].l) {
+      struct libs *lb = &libs[m];
+      printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", s, lb->l,
+        (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n,
+        lb->tc>=0.000001?((double)n/1000000.0) / (lb->tc/TM_T):0.0,
+        lb->td>=0.000001?((double)n/1000000.0) / (lb->td/TM_T):0.0,
+        lb->s );
+    }
+}
+
+//int libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; }
+
+unsigned bench(unsigned *__restrict__ _in, unsigned _inlen, int blksize, unsigned char *__restrict__ _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict__ cpy, int bb) {
+  int m,id,b=bb,i; if(verb) { printf(":%d,", _inlen); fflush(stdout); }
+  unsigned cn = 0; tm_t tt0 = tminit();
+  for(i = 0; i < 10; i++) memcpy(_out, _in, _inlen);
+  for(m = 0; (id=libs[m].id) >= 0; m++) { int r,insize=(id==P_OPTP4)?blksize-1:blksize;
+    struct libs *lb = &libs[m]; unsigned cl; if(verb) { printf("%s", libs[m].s);fflush(stdout); }
+    int t,tj=1; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt;
+    for(t = 0; t < trips; t++) { t0 = tminit();
+      for(r = 0; r < reps; ) {
+        cn=cl=0;
+        unsigned *in;
+        unsigned char *out; //vsini();
+        for(out = _out, in = _in; in < _in+_inlen; ) {
+          unsigned n,inlen = *in++,*ip=in; in += inlen;
+          *(unsigned *)out = inlen; out+=4; /*out++=0x5a;*/
+          for(;ip < in; ip += n) { n = ip+insize<=in?insize:in-ip; cn += n; unsigned char *sout=out; //printf("%d ", n);
+            out = beenc(ip,n,out,id,bb);
+            cl += out-sout;
+          }
+          if(out > _out+outsize) { fprintf(stderr, "Overflow error %llu, %lld in %s\n", outsize, (long long)(out - _out), lb->s); exit(0); }
+        }
+        r++; if((tt = tmtime() - t0) > tx) break;
+      }
+      if(tt < tc) { tc = tt; tj = r; }
+      if(tmtime() - tt0 > tx*trips) { /*printf("#");fflush(stdout);*/ /*sleep(1);*/ tt0 = tminit(); }
+    }
+    lb->l += cl; lb->tc += (double)tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("+ ");fflush(stdout); }
+    tt0 = tminit();
+    for(t = 0; t < trips; t++) { t0 = tminit();
+      for(r = 0; r < reps; ) { unsigned *out; unsigned char *in;
+        for(out = cpy, in = _out; out < cpy+_inlen;) {
+          unsigned n,*op, outlen=*(unsigned *)in; in+=4;
+          *out++ = outlen;
+          for(op=out,out += outlen; op < out; op += n) {
+            n = op + insize<=out?insize:out-op;
+            in = bedec(in,n,op,id,bb);
+          }
+        }
+        r++;
+        if((tt = tmtime() - t0) > tx)
+          break;
+      }
+      if(tt < td) {
+        td = tt;
+        tj = r;
+      }
+      if(tmtime() - tt0 > tx*trips) {
+        tt0 = tminit();
+      }
+    }
+    lb->td += (double)td/tj;
+    check(_in, _inlen, cpy, lb->s);
+  }
+  return cn;
+}
+
+int z_cmp(const void *a, const void *b) { /* compare the doubles, not the pointers */
+  double x = *(const double *)a, y = *(const double *)b;
+  if(x < y) return -1;
+  if(x > y) return  1;
+  return 0;
+}
+
+void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) {
+  int i,m = x2 - x1 + 1;
+  double prob, cum, *zmap;
+  if(!(zmap = malloc(m*sizeof(zmap[0])))) {
+    fprintf(stderr, "malloc error\n");
+    exit(-1);
+  };
+
+  srand48(1);
+  for(cum =0.0,i = 0; i < m; i++)
+    cum += 1.0 / pow(i+1, alpha);
+  cum = 1.0 / cum;
+  for(prob=0.0,i = 0; i < m; i++)
+    zmap[i] = prob += cum / pow(i+1, alpha);
+  qsort(zmap, m, sizeof(zmap[0]), z_cmp);
+
+  for(i = 0; i < n; i++) {
+    double r = drand48();
+    int l = 0, h = m-1;
+    while(l < h) {
+      int k = (l + h) >> 1;
+      if(r > zmap[k]) l = k + 1;
+      else h = k;
+    }
+    a[i] = x1 + l;
+  }
+  free(zmap);
+}
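
zipfgen() builds the cumulative Zipf distribution once, then binary-searches it for every drawn value. A usage sketch, assuming it is compiled into this file (the ZIPF_DEMO guard and the parameters are hypothetical):

    #ifdef ZIPF_DEMO
    int main(void) {
      enum { N = 1000000 };
      static unsigned a[N];
      unsigned i, ones = 0;
      zipfgen(a, 1.1, 1, 255, N);             /* skew alpha=1.1, range [1,255] */
      for(i = 0; i < N; i++) ones += a[i] == 1; /* head of the distribution    */
      printf("%.1f%% of all values equal 1\n", ones*100.0/N);
      return 0;
    }
    #endif
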
+#define OVD (10*MB)
+int main(int argc, char *argv[]) {
+  char fname[0x100], *cmd=NULL;
+  unsigned bp=0,ftype = T_ID, rm=0,rx=30,n=10000000;
+  long long rdmax = 1<<30; tm_t tx=1*1000000;
+  double a = 1.5;
+
+  tminit();
+  VarIntG8IU();
+
+  int c, digit_optind = 0;
+  int this_option_optind = optind ? optind : 1, option_index = 0;
+  static struct option long_options[] = { {"repeat", 0, 0, 'r'}, {0,0, 0, 0} };
+  for(;;) {
+    if((c = getopt_long(argc, argv, "Ac:TBR:ys:r:n:b:c:e:t:r:M:v:m:x:a:", long_options, &option_index)) == -1) break;
+    switch(c) {
+      case 0  : printf("Option %s", long_options[option_index].name); if(optarg) printf(" with arg %s", optarg); printf("\n"); break;
+      case 'r': reps  = atoi(optarg); break;
+      case 'R': trips = atoi(optarg); break;
+      case 'v': verb  = atoi(optarg); verb++; break;
+      case 't': tx    = atoi(optarg)*1000000; break;
+      case 'c': ftype = atoi(optarg); break;
+      case 'b': rdmax = atoi(optarg)*MB; break;
+      case 'e': cmd   = optarg; break;
+      case 'm': rm    = atoi(optarg); break;
+      case 'x': rx    = atoi(optarg); break;
+      case 'B': bp++; break;
+      case 'n': n     = atoi(optarg); break;
+      case 'a': a     = strtod(optarg, NULL); break;
+      default : fprintf(stdout,"unknown option: %c \n", optopt); exit(1);
+    }
+  }
+  int fno,i=0; //libini();
+  /* reconstructed (lost in extraction): without -B, -m/-x give the zipf range as powers of two */
+  if(!bp) { rm = 1u << rm; rx = 1u << rx; if(rx > n) rx = n; } else if(!rm) rm = 1;
+  //printf("range=(%d,%d,%d)\n", rm, rx, n);fflush(stdout);
+  struct libss *ls;
+  if(cmd) {
+    char *q=NULL;
+    for(i=0,libs[0].id = -1;;) {
+      if(cmd) {
+        if(!*cmd) break; //printf("cmd='%s'", cmd);
+        q = strchr(cmd,',');
+        if(q) *q=' ';
+        if((q = strchr(cmd,'/')))
+          *q = '\0';
+        for(ls = libss; ls->id >= 0; ls++)
+          if(!strcasecmp(ls->s, cmd)) {
+            memset(&libs[i], 0, sizeof(struct libs));
+            libs[i].id  = ls->id;
+            libs[i].err = 0;
+            libs[i].s   = ls->s;
+            libs[i++].v = ls->v;
+            break;
+          }
+        if(ls->id < 0) {
+          printf("library: '%s' not found\n", cmd);
+          exit(-1);
+        }
+        cmd = q?(q+1):"";
+      }
+    }
+  } else for(ls = libss; ls->id >= 0; ls++) {
+    libs[i].id  = ls->id;
+    libs[i].err = 0;
+    libs[i].s   = ls->s; //printf("%s\n", ls->s);fflush(stdout);
+    libs[i++].v = ls->v;
+  }
+  libs[i].id = -1;
+
+  if(argc <= optind) {
+    unsigned *in, *out, *cpy; unsigned long long totlen=0;
+    in  = malloc(n*4+OVD); if(!in)  { printf("malloc err=%u", n); exit(0); }
+    out = malloc(n*4+OVD); if(!out) { printf("malloc err=%u", n); exit(0); }
+    cpy = malloc(n*4+OVD); if(!cpy) { printf("malloc err=%u", n); exit(0); }
+    char s[33]; s[0]=0;
+    if(bp) {
+      int b;
+      printf("bittest\n"); fflush(stdout);
+      for(b = rm; b <= rx; b++) {
+        sprintf(s,"b=%d", b);
+        *in = n;
+        for(i = 1; i <= n; i++)
+          in[i] = (1ull << b)-1;
+        totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, b);
+        print(totlen, s);
+      }
+    } else {
+      printf("zipf a=%3.1f [%u,%u]\n", a, rm, rx);
+      *in = n;
+      zipfgen(in+1, a, rm, rx, n); //stprint();
+      totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, -1);
+      print(totlen, s);
+    }
+    free(in);
+    free(cpy);
+    free(out);
+  } else for(fno = optind; fno < argc; fno++) {
+    char *inname = argv[fno];
+    FILE *fi = fopen64(inname, "r");
+    if(!fi) {
+      fprintf(stderr, "open error '%s'", inname); perror(inname);
+      exit(-1);
+    }
+    fseek(fi, 0, SEEK_END);
+    unsigned long long fisize = ftell(fi);
+    fseek(fi, 0, SEEK_SET);
+    if(fisize > rdmax)
+      fisize = rdmax;
+    fisize /= 4; //setvbuf(fi, NULL, _IOFBF, 1000*MB);
+    unsigned *in, *out, *cpy, *ip;
+    unsigned long long totlen=0;
+    out = malloc(fisize*4+OVD);  if(!out) { printf("malloc err=%llu", fisize); exit(0); }
+    cpy = malloc(fisize*4+OVD);  if(!cpy) { printf("malloc err=%llu", fisize); exit(0); }
+    in  = malloc(fisize*4+1024); if(!in)  { printf("malloc err=%llu", fisize); exit(0); }
+    PGM_FD(fileno(fi));
+    int r = 0;
+    if(fread(&r, 4, 1, fi) != 1) r = 0;
+    while(r > 0) {
+      for(ip = in; ip+r <= in+fisize;) {
+        int rc; PGM_FDPUT(fileno(fi));
+        if((rc = fread(ip+1, 4, r, fi)) <= 0)
+          goto a;
+
+        if(r >= rm && r <= rx) {
+          *ip++ = r;
+          int j;
+          if(verb)
+            printf("%d%s ", r, ftype==T_ID?"I":"N"); /* format reconstructed: the original passed two arguments to "%d " */
+          fflush(stdout);
+          if(ftype == T_ID) {
+            for(j = 0; j < r; ) {
+              unsigned m = j+BLK_SIZE>r?r-j:BLK_SIZE;
+              int i,did,dido = -1;
+              for(i = 0; i < m; i++) {
+                did = ip[i];
+                if(did < dido) {
+                  printf("IDs in '%s' not sorted.did=%d,dido=%d ", inname, did, dido);
+                  exit(0);
+                }
+                ip[i] = did - dido - 1;
+                dido = /*ip[0]*/did; //printf("%d,", ip[i]); xbits[bsr32(ip[i])]++;
+              }
+              j += m; ip += m; //printf("\r");
+            }
+          } else
+            ip += r;
+        }
+        r = rc = 0;
+        if(ftype == T_ID)
+          rc = fread(&r, 4, 1, fi);
+        if(rc <= 0 || !r)
+          break;
+      }
+      totlen += bench(in, ip-in, BLK_SIZE, out, fisize*4+OVD, inname, tx, cpy, -1);
+      if(totlen > n)
+        break;
+    }
+    a:fclose(fi); //stprint();
+    print(totlen,inname);
+    free(in);
+    free(cpy);
+    free(out);
+  }
+}
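
Before benchmarking T_ID files, main() rewrites each sorted postings list as d-gaps (ip[i] = did - dido - 1, starting from dido = -1). A self-contained sketch of that transform and its inverse:

    /* turn a sorted ID list into the d-gaps icbench codes, then recover it */
    #include <stdio.h>

    int main(void) {
      unsigned ids[5] = { 3, 7, 8, 20, 21 }, gaps[5], back[5], i, prev;
      for(prev = -1u, i = 0; i < 5; i++) { gaps[i] = ids[i] - prev - 1; prev = ids[i]; }
      for(prev = -1u, i = 0; i < 5; i++) { prev += gaps[i] + 1; back[i] = prev; }
      for(i = 0; i < 5; i++) printf("%u -> %u -> %u\n", ids[i], gaps[i], back[i]);
      return 0;
    }
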
diff --git a/makefile b/makefile new file mode 100644 index 0000000..0488c6e --- /dev/null +++ b/makefile @@ -0,0 +1,28 @@
+# powturbo (c) Copyright 2007-2013
+CFLAGS=-ffast-math -fstrict-aliasing -march=native -w -fpermissive
+
+BIT=./
+all: icbench
+
+bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunpack64_.h
+	cc -O2 $(CFLAGS) -c $(BIT)bitunpack.c
+
+bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h
+	cc -O2 $(CFLAGS) -c $(BIT)bitpack.c
+
+vp4dc.o: $(BIT)vp4dc.c
+	cc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c
+
+SIMDCOMPD=aux/simdcomp/
+SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o
+
+varintg8iu.o: $(BIT)aux/varintg8iu.c $(BIT)aux/varintg8iu.h
+	cc -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)aux/varintg8iu.c
+
+icbench: icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o varintg8iu.o vp4dd.o vp4dc.o $(SIMDCOMP)
+	cc -O3 icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o vp4dd.o vp4dc.o varintg8iu.o $(SIMDCOMP) -lm -o icbench $(LFLAGS)
+
+.c.o:
+	cc -O3 $(CFLAGS) $< -c -o $@
diff --git a/vint.h b/vint.h new file mode 100644 index 0000000..5169b5c --- /dev/null +++ b/vint.h @@ -0,0 +1,70 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vint.h - "Integer Compression" variable byte
+**/
+
+#ifndef VINT_H
+#define VINT_H
+#include "conf.h"
+//-------------------------------------- variable byte : 32 bits ----------------------------------------------------------------
+                              //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+static unsigned char vtab[]= {    1,   2,   1,   3,   1,   2,   1,   4,   1,   2,   1,   3,   1,   2,   1,   5 };
+#define vbvlen(__x) vtab[(__x)&0xf]
+
+#define vbputa(__op, __x, __act) {\
+       if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
+  else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
+  else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\
+  else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\
+  else                           { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
+}
+
+#define vbgeta(__ip, __x, __act) do { __x = *__ip;\
+       if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\
+  else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
+  else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | *(__ip+2) << 13; __ip += 3; __act;}\
+  else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
+  else                     { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\
+} while(0)
+
+#define vblen(_x_) ({ unsigned __x = _x_; __x > 0x7f?(__x > 0x3fff?(__x > 0x1fffff?(__x > 0x0fffffff?5:4):3):2):1; })
+#define vbput(__op, __x) { unsigned _x__ = __x; vbputa(__op, _x__, ;); }
+#define vbget(__ip) ({ unsigned _x_; vbgeta(__ip, _x_, ;); _x_; })
+
+static inline unsigned char *vbenc (unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned *in_ = in +n;    while(in < in_)   vbput(out, *in++); return out;}
+static inline unsigned char *vbdec (unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;}
+
+//--------------------------------------- variable byte : 15 bits -------------------------------------------------------------------
+#define vblen16(__x) ((__x) > 0x7f?2:1)
+#define vbput16(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)
+#define vbgeta16(__ip,__x, __act) do { if((__x = *__ip++) > 0x7f) __x = (__x & 0x7f) << 8 | *__ip++; __act; } while(0)
+#define vbget16(__ip) ({ unsigned _x; vbgeta16(__ip, _x, ;); _x; })
+
+static inline unsigned char *vbenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned short *in_ = in +n;    while(in < in_)   vbput16(out, *in++); return out;}
+static inline unsigned char *vbdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out) { unsigned short *out_ = out+n,x; while(out < out_) vbgeta16(in, x, *out++ = x); return in; } /* typo fixed: vgeta16 -> vbgeta16 */
+
+#endif
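
A short round-trip through the TurboVbyte macros above; it assumes compilation from the repository root so that vint.h and conf.h resolve, and gcc/clang for the statement-expression macros:

    #include <stdio.h>
    #include "vint.h"

    int main(void) {
      unsigned char buf[8], *op = buf, *ip = buf;
      unsigned x = 100000;            /* needs 17 bits, so the 3-byte branch fires */
      vbput(op, x);
      printf("%u coded in %d byte(s), decoded %u\n", x, (int)(op - buf), vbget(ip));
      return 0;
    }
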
diff --git a/vp4dc.c b/vp4dc.c new file mode 100644 index 0000000..17d323f --- /dev/null +++ b/vp4dc.c @@ -0,0 +1,41 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vp4dc.c - "Integer Compression" Turbo PforDelta   (file name fixed; the header said vp4dd.c)
+**/
+
+#include "conf.h"
+#include "bitpack.h"
+#include "vp4dc.h"
+
+#define PAD8(__x) ( (((__x)+8-1)/8) )
+#include <string.h> /* memset; the original include name was lost in extraction */
+
+#define USIZE 32
+#include "vp4dc_.h"
+
+#define USIZE 16
+#include "vp4dc_.h"
diff --git a/vp4dc.h b/vp4dc.h new file mode 100644 index 0000000..e23a94b --- /dev/null +++ b/vp4dc.h @@ -0,0 +1,27 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vp4dc.h - "Integer Compression" Turbo PforDelta
+**/
+unsigned char *p4denc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out);
+unsigned char *p4denc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out);
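
p4denc (next file) picks the base bit width b by charging PAD8(n*b) bytes for the packed array plus 2+16+PAD8(x*(xb-i)) bytes for x exceptions of xb-i leftover bits each. A standalone rerun of that cost loop on an assumed histogram (120 five-bit values and 8 thirteen-bit outliers in a 128-integer block):

    #include <stdio.h>
    #define PAD8(x) (((x)+7)/8)

    int main(void) {
      unsigned cnt[33] = {0}, n = 128;
      int i;
      cnt[5] = 120; cnt[13] = 8;
      unsigned xb = 13, b = xb, ml = PAD8(n*b)+1, x = cnt[13];
      for(i = xb-1; i >= 0; i--) {               /* same scan as in vp4dc_.h */
        unsigned l = PAD8(n*i) + (x ? 2+16+PAD8(x*(xb-i)) : 1);
        if(l < ml) { b = i; ml = l; }
        x += cnt[i];
      }
      printf("best b=%u, %u bytes instead of %u\n", b, ml, PAD8(n*xb)+1);
      return 0;
    }
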
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vp4dc_.h - "Integer Compression" Turbo PforDelta
+**/
+#define uint_t TEMPLATE3(uint, USIZE, _t)
+
+unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ out) {
+  int i; unsigned cnt[USIZE+1] = {0}; uint_t b = 0;
+  for(i = 0; i < n; i++) b |= in[i], ++cnt[TEMPLATE2(bsr, USIZE)(in[i])];
+  b = TEMPLATE2(bsr, USIZE)(b);
+
+  unsigned xb=b, ml = PAD8(n*b)+1, x = cnt[b];
+  for(i = b-1; i >= 0; i--) {
+    unsigned l = PAD8(n*i) + (x?(2+16+PAD8(x*(xb-i))):1);
+    if(l < ml) b = i, ml = l;
+    x += cnt[i]; /*if(x >= 64) break;*/
+  }
+  if(xb == b) {
+    *out++ = b << 1;
+    return TEMPLATE2(bitpack, USIZE)(in, n, b, out);
+  }
+  xb -= b;
+  uint_t _in[0x100], inx[0x100]; unsigned miss[0x100];
+  unsigned long long xmap[2]; xmap[0] = xmap[1] = 0;
+  unsigned xn, msk = (1ull << b) - 1;
+  /* exception scan reconstructed (lost in extraction): keep the b low bits of
+     every value and record the positions of values that do not fit */
+  for(xn = i = 0; i < n; i++) {
+    _in[i] = in[i] & msk; miss[xn] = i; xn += in[i] > msk;
+  }
+  for(i = 0; i < xn; i++) {
+    unsigned c = miss[i];
+    inx[i] = in[c] >> b;
+    xmap[c>>6] |= (1ull << (c & 0x3f));
+  }
+  *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2;
+  out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out);
+  *(unsigned long long *)out = xmap[0]; out += 8;
+  *(unsigned long long *)out = xmap[1]; out += 8;
+  memset(&inx[xn],0,128);
+  return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out);
+}
diff --git a/vp4dd.c b/vp4dd.c new file mode 100644 index 0000000..2d9e452 --- /dev/null +++ b/vp4dd.c @@ -0,0 +1,40 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vp4dd.c - "Integer Compression" Turbo PforDelta
+**/
+
+#include "conf.h"
+#include "bitunpack.h"
+#include "vp4dd.h"
+
+#define PAD8(__x) ( (((__x)+8-1)/8) )
+#include <string.h> /* the original include name was lost in extraction; string.h assumed */
+#define USIZE 32
+#include "vp4dd_.h"
+
+//#define USIZE 16
+//#include "vp4dd_.h"
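
The decoder in vp4dd.h below locates an exception's slot in the packed exception stream by popcounting the bitmap bits below the current index, as in vp4dget32's popcnt64(xmap & ~((~0ull)<<cl)). The same arithmetic, with the gcc builtin standing in for conf.h's popcnt64:

    #include <stdio.h>

    int main(void) {
      unsigned long long xmap = 1ull<<3 | 1ull<<17 | 1ull<<42; /* exceptions at slots 3, 17, 42 */
      unsigned idx = 17;                                       /* decoding slot 17 */
      if(xmap >> idx & 1) {
        unsigned rank = __builtin_popcountll(xmap & ~((~0ull) << idx));
        printf("slot %u is exception #%u in the stream\n", idx, rank);
      }
      return 0;
    }
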
diff --git a/vp4dd.h b/vp4dd.h new file mode 100644 index 0000000..71af111 --- /dev/null +++ b/vp4dd.h @@ -0,0 +1,73 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+    - email    : powturbo@gmail.com
+    - github   : https://github.com/powturbo
+    - homepage : https://sites.google.com/site/powturbo/
+    - twitter  : https://twitter.com/powturbo
+
+    vp4dd.h - "Integer Compression" Turbo PforDelta
+**/
+unsigned char *p4ddec32( unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);
+unsigned char *p4ddecx32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out);
+
+//-----------------------------------------------------------------------
+#define P4D_PAD8(__x) ( (((__x)+8-1)/8) )
+#define P4D_XB(__x)   ((__x & 1)?(__x >> 8):0)
+#define P4D_B(__x)    ((__x >> 1) & 0x3f)
+#define P4D_ININC(__in, __x) __in += 1+(__x & 1)
+
+static inline unsigned vp4dbits(unsigned char *__restrict__ in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); }
+
+struct p4d {
+  unsigned long long *xmap;
+  unsigned char *ex;
+  unsigned i,xb,cum[2];
+  int oval,idx;
+};
+
+static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int n, unsigned *b) { unsigned char *in = *pin;
+  static unsigned long long xmap[2] = { 0 };
+
+  unsigned i = *(unsigned short *)in;
+  p4d->i  = i;
+  *b      = P4D_B(i);
+  p4d->xb = P4D_XB(i);
+  P4D_ININC(in,i);
+  *pin = in;
+
+  p4d->ex   = in + P4D_PAD8(n*(*b));
+  p4d->xmap = (i&1)?(unsigned long long *)p4d->ex:xmap;
+  p4d->ex  += (i&1)?16:0;
+  p4d->cum[0] = 0;
+  p4d->cum[1] = popcnt64(p4d->xmap[0]);
+  p4d->oval = p4d->idx = -1;
+}
+
+/* NOTE: this getter was partly lost in extraction and is restored from the
+   surviving fragments: the base bits come from the packed array, and when
+   the bitmap marks idx as an exception, its high bits are fetched at the
+   exception's popcount rank. */
+static ALWAYS_INLINE unsigned vp4dget32(struct p4d p4d, unsigned char *__restrict__ in, unsigned b, unsigned idx) {
+  unsigned bi, cl, u = _bitgetx32(in, b, idx*b);
+  if(unlikely(p4d.xmap[bi = idx>>6] & (1ull<<(cl = idx & 0x3f))))
+    u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<<cl)))*p4d.xb) << b;
+  return u;
+}
+
+static inline int p4dnxt(struct p4d *p4d, unsigned char *__restrict__ in, unsigned b, int val) {
+  do p4d->oval += vp4dget32(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val);
+  return p4d->oval;
+}
diff --git a/vp4dd_.h b/vp4dd_.h new file mode 100644 index 0000000..f92ce5f --- /dev/null +++ b/vp4dd_.h @@ -0,0 +1,369 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vp4dd_.h - "Integer Compression" Turbo PforDelta +**/ + #ifdef __AVX2__ +#include + +static ALIGNED(unsigned char, shuffles[256][8], 32) = { + { 0,0,0,0,0,0,0,0 }, + { 0,1,1,1,1,1,1,1 }, + { 1,0,1,1,1,1,1,1 }, + { 0,1,2,2,2,2,2,2 }, + { 1,1,0,1,1,1,1,1 }, + { 0,2,1,2,2,2,2,2 }, + { 2,0,1,2,2,2,2,2 }, + { 0,1,2,3,3,3,3,3 }, + { 1,1,1,0,1,1,1,1 }, + { 0,2,2,1,2,2,2,2 }, + { 2,0,2,1,2,2,2,2 }, + { 0,1,3,2,3,3,3,3 }, + { 2,2,0,1,2,2,2,2 }, + { 0,3,1,2,3,3,3,3 }, + { 3,0,1,2,3,3,3,3 }, + { 0,1,2,3,4,4,4,4 }, + { 1,1,1,1,0,1,1,1 }, + { 0,2,2,2,1,2,2,2 }, + { 2,0,2,2,1,2,2,2 }, + { 0,1,3,3,2,3,3,3 }, + { 2,2,0,2,1,2,2,2 }, + { 0,3,1,3,2,3,3,3 }, + { 3,0,1,3,2,3,3,3 }, + { 0,1,2,4,3,4,4,4 }, + { 2,2,2,0,1,2,2,2 }, + { 0,3,3,1,2,3,3,3 }, + { 3,0,3,1,2,3,3,3 }, + { 0,1,4,2,3,4,4,4 }, + { 3,3,0,1,2,3,3,3 }, + { 0,4,1,2,3,4,4,4 }, + { 4,0,1,2,3,4,4,4 }, + { 0,1,2,3,4,5,5,5 }, + { 1,1,1,1,1,0,1,1 }, + { 0,2,2,2,2,1,2,2 }, + { 2,0,2,2,2,1,2,2 }, + { 0,1,3,3,3,2,3,3 }, + { 2,2,0,2,2,1,2,2 }, + { 0,3,1,3,3,2,3,3 }, + { 3,0,1,3,3,2,3,3 }, + { 0,1,2,4,4,3,4,4 }, + { 2,2,2,0,2,1,2,2 }, + { 0,3,3,1,3,2,3,3 }, + { 3,0,3,1,3,2,3,3 }, + { 0,1,4,2,4,3,4,4 }, + { 3,3,0,1,3,2,3,3 }, + { 0,4,1,2,4,3,4,4 }, + { 4,0,1,2,4,3,4,4 }, + { 0,1,2,3,5,4,5,5 }, + { 2,2,2,2,0,1,2,2 }, + { 0,3,3,3,1,2,3,3 }, + { 3,0,3,3,1,2,3,3 }, + { 0,1,4,4,2,3,4,4 }, + { 3,3,0,3,1,2,3,3 }, + { 0,4,1,4,2,3,4,4 }, + { 4,0,1,4,2,3,4,4 }, + { 0,1,2,5,3,4,5,5 }, + { 3,3,3,0,1,2,3,3 }, + { 0,4,4,1,2,3,4,4 }, + { 4,0,4,1,2,3,4,4 }, + { 0,1,5,2,3,4,5,5 }, + { 4,4,0,1,2,3,4,4 }, + { 0,5,1,2,3,4,5,5 }, + { 5,0,1,2,3,4,5,5 }, + { 0,1,2,3,4,5,6,6 }, + { 1,1,1,1,1,1,0,1 }, + { 0,2,2,2,2,2,1,2 }, + { 2,0,2,2,2,2,1,2 }, + { 0,1,3,3,3,3,2,3 }, + { 2,2,0,2,2,2,1,2 }, + { 0,3,1,3,3,3,2,3 }, + { 3,0,1,3,3,3,2,3 }, + { 0,1,2,4,4,4,3,4 }, + { 2,2,2,0,2,2,1,2 }, + { 0,3,3,1,3,3,2,3 }, + { 3,0,3,1,3,3,2,3 }, + { 0,1,4,2,4,4,3,4 }, + { 3,3,0,1,3,3,2,3 }, + { 0,4,1,2,4,4,3,4 }, + { 4,0,1,2,4,4,3,4 }, + { 0,1,2,3,5,5,4,5 }, + { 2,2,2,2,0,2,1,2 }, + { 0,3,3,3,1,3,2,3 }, + { 3,0,3,3,1,3,2,3 }, + { 0,1,4,4,2,4,3,4 }, + { 3,3,0,3,1,3,2,3 }, + { 0,4,1,4,2,4,3,4 }, + { 4,0,1,4,2,4,3,4 }, + { 0,1,2,5,3,5,4,5 }, + { 3,3,3,0,1,3,2,3 }, + { 0,4,4,1,2,4,3,4 }, + { 4,0,4,1,2,4,3,4 }, + { 0,1,5,2,3,5,4,5 }, + { 4,4,0,1,2,4,3,4 }, + { 0,5,1,2,3,5,4,5 }, + { 5,0,1,2,3,5,4,5 }, + { 0,1,2,3,4,6,5,6 }, + { 2,2,2,2,2,0,1,2 }, + { 0,3,3,3,3,1,2,3 }, + { 3,0,3,3,3,1,2,3 }, + { 0,1,4,4,4,2,3,4 }, + { 3,3,0,3,3,1,2,3 }, + { 0,4,1,4,4,2,3,4 }, + { 4,0,1,4,4,2,3,4 }, + { 0,1,2,5,5,3,4,5 }, + { 3,3,3,0,3,1,2,3 }, + { 0,4,4,1,4,2,3,4 }, + { 4,0,4,1,4,2,3,4 }, + { 0,1,5,2,5,3,4,5 }, + { 4,4,0,1,4,2,3,4 }, + { 0,5,1,2,5,3,4,5 }, + { 5,0,1,2,5,3,4,5 }, + { 0,1,2,3,6,4,5,6 }, + { 3,3,3,3,0,1,2,3 }, + { 0,4,4,4,1,2,3,4 }, + { 4,0,4,4,1,2,3,4 }, + { 0,1,5,5,2,3,4,5 }, + { 4,4,0,4,1,2,3,4 }, + { 0,5,1,5,2,3,4,5 }, + { 5,0,1,5,2,3,4,5 }, + { 0,1,2,6,3,4,5,6 }, + { 4,4,4,0,1,2,3,4 }, + { 0,5,5,1,2,3,4,5 }, + { 5,0,5,1,2,3,4,5 }, + { 0,1,6,2,3,4,5,6 }, + { 5,5,0,1,2,3,4,5 }, + { 0,6,1,2,3,4,5,6 }, + { 6,0,1,2,3,4,5,6 }, + { 0,1,2,3,4,5,6,7 }, + { 1,1,1,1,1,1,1,0 }, + { 0,2,2,2,2,2,2,1 }, + { 2,0,2,2,2,2,2,1 }, + { 0,1,3,3,3,3,3,2 }, + { 2,2,0,2,2,2,2,1 }, + { 0,3,1,3,3,3,3,2 }, + { 3,0,1,3,3,3,3,2 }, + { 0,1,2,4,4,4,4,3 }, + { 2,2,2,0,2,2,2,1 }, + { 0,3,3,1,3,3,3,2 }, + { 3,0,3,1,3,3,3,2 }, + { 0,1,4,2,4,4,4,3 }, + { 
3,3,0,1,3,3,3,2 }, + { 0,4,1,2,4,4,4,3 }, + { 4,0,1,2,4,4,4,3 }, + { 0,1,2,3,5,5,5,4 }, + { 2,2,2,2,0,2,2,1 }, + { 0,3,3,3,1,3,3,2 }, + { 3,0,3,3,1,3,3,2 }, + { 0,1,4,4,2,4,4,3 }, + { 3,3,0,3,1,3,3,2 }, + { 0,4,1,4,2,4,4,3 }, + { 4,0,1,4,2,4,4,3 }, + { 0,1,2,5,3,5,5,4 }, + { 3,3,3,0,1,3,3,2 }, + { 0,4,4,1,2,4,4,3 }, + { 4,0,4,1,2,4,4,3 }, + { 0,1,5,2,3,5,5,4 }, + { 4,4,0,1,2,4,4,3 }, + { 0,5,1,2,3,5,5,4 }, + { 5,0,1,2,3,5,5,4 }, + { 0,1,2,3,4,6,6,5 }, + { 2,2,2,2,2,0,2,1 }, + { 0,3,3,3,3,1,3,2 }, + { 3,0,3,3,3,1,3,2 }, + { 0,1,4,4,4,2,4,3 }, + { 3,3,0,3,3,1,3,2 }, + { 0,4,1,4,4,2,4,3 }, + { 4,0,1,4,4,2,4,3 }, + { 0,1,2,5,5,3,5,4 }, + { 3,3,3,0,3,1,3,2 }, + { 0,4,4,1,4,2,4,3 }, + { 4,0,4,1,4,2,4,3 }, + { 0,1,5,2,5,3,5,4 }, + { 4,4,0,1,4,2,4,3 }, + { 0,5,1,2,5,3,5,4 }, + { 5,0,1,2,5,3,5,4 }, + { 0,1,2,3,6,4,6,5 }, + { 3,3,3,3,0,1,3,2 }, + { 0,4,4,4,1,2,4,3 }, + { 4,0,4,4,1,2,4,3 }, + { 0,1,5,5,2,3,5,4 }, + { 4,4,0,4,1,2,4,3 }, + { 0,5,1,5,2,3,5,4 }, + { 5,0,1,5,2,3,5,4 }, + { 0,1,2,6,3,4,6,5 }, + { 4,4,4,0,1,2,4,3 }, + { 0,5,5,1,2,3,5,4 }, + { 5,0,5,1,2,3,5,4 }, + { 0,1,6,2,3,4,6,5 }, + { 5,5,0,1,2,3,5,4 }, + { 0,6,1,2,3,4,6,5 }, + { 6,0,1,2,3,4,6,5 }, + { 0,1,2,3,4,5,7,6 }, + { 2,2,2,2,2,2,0,1 }, + { 0,3,3,3,3,3,1,2 }, + { 3,0,3,3,3,3,1,2 }, + { 0,1,4,4,4,4,2,3 }, + { 3,3,0,3,3,3,1,2 }, + { 0,4,1,4,4,4,2,3 }, + { 4,0,1,4,4,4,2,3 }, + { 0,1,2,5,5,5,3,4 }, + { 3,3,3,0,3,3,1,2 }, + { 0,4,4,1,4,4,2,3 }, + { 4,0,4,1,4,4,2,3 }, + { 0,1,5,2,5,5,3,4 }, + { 4,4,0,1,4,4,2,3 }, + { 0,5,1,2,5,5,3,4 }, + { 5,0,1,2,5,5,3,4 }, + { 0,1,2,3,6,6,4,5 }, + { 3,3,3,3,0,3,1,2 }, + { 0,4,4,4,1,4,2,3 }, + { 4,0,4,4,1,4,2,3 }, + { 0,1,5,5,2,5,3,4 }, + { 4,4,0,4,1,4,2,3 }, + { 0,5,1,5,2,5,3,4 }, + { 5,0,1,5,2,5,3,4 }, + { 0,1,2,6,3,6,4,5 }, + { 4,4,4,0,1,4,2,3 }, + { 0,5,5,1,2,5,3,4 }, + { 5,0,5,1,2,5,3,4 }, + { 0,1,6,2,3,6,4,5 }, + { 5,5,0,1,2,5,3,4 }, + { 0,6,1,2,3,6,4,5 }, + { 6,0,1,2,3,6,4,5 }, + { 0,1,2,3,4,7,5,6 }, + { 3,3,3,3,3,0,1,2 }, + { 0,4,4,4,4,1,2,3 }, + { 4,0,4,4,4,1,2,3 }, + { 0,1,5,5,5,2,3,4 }, + { 4,4,0,4,4,1,2,3 }, + { 0,5,1,5,5,2,3,4 }, + { 5,0,1,5,5,2,3,4 }, + { 0,1,2,6,6,3,4,5 }, + { 4,4,4,0,4,1,2,3 }, + { 0,5,5,1,5,2,3,4 }, + { 5,0,5,1,5,2,3,4 }, + { 0,1,6,2,6,3,4,5 }, + { 5,5,0,1,5,2,3,4 }, + { 0,6,1,2,6,3,4,5 }, + { 6,0,1,2,6,3,4,5 }, + { 0,1,2,3,7,4,5,6 }, + { 4,4,4,4,0,1,2,3 }, + { 0,5,5,5,1,2,3,4 }, + { 5,0,5,5,1,2,3,4 }, + { 0,1,6,6,2,3,4,5 }, + { 5,5,0,5,1,2,3,4 }, + { 0,6,1,6,2,3,4,5 }, + { 6,0,1,6,2,3,4,5 }, + { 0,1,2,7,3,4,5,6 }, + { 5,5,5,0,1,2,3,4 }, + { 0,6,6,1,2,3,4,5 }, + { 6,0,6,1,2,3,4,5 }, + { 0,1,7,2,3,4,5,6 }, + { 6,6,0,1,2,3,4,5 }, + { 0,7,1,2,3,4,5,6 }, + { 7,0,1,2,3,4,5,6 }, + { 0,1,2,3,4,5,6,7, } + }; + + #elif defined(__SSE4_1__) +#include + #endif + +#define uint_t TEMPLATE3(uint, USIZE, _t) + +unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, uint_t *__restrict__ out) { + uint_t ex[0x100+8]; unsigned i = *(unsigned short *)in; uint_t b = P4D_B(i); unsigned xb = P4D_XB(i); + P4D_ININC(in,i); + in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out); + if(i & 1) { + unsigned long long b0 = *(unsigned long long *)in; in += 8; unsigned long long b1 = *(unsigned long long *)in; in += 8; + in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(b0) + popcnt64(b1), xb, ex); + #ifdef __AVX2__ + unsigned *op,*pex = ex; + for(op = out; b0; b0 >>= 8,op += 8) { const unsigned m = (unsigned char)b0, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), 
_mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s;
+  }
+  #elif defined(__SSE4_1__)
+
+  static ALIGNED(char, shuffles[16][16], 16) = {
+    #define _ 0x80
+    { _,_,_,_, _,_,_,_, _,_, _, _,  _, _, _,_  },
+    { 0,1,2,3, _,_,_,_, _,_, _, _,  _, _, _,_  },
+    { _,_,_,_, 0,1,2,3, _,_, _, _,  _, _, _,_  },
+    { 0,1,2,3, 4,5,6,7, _,_, _, _,  _, _, _,_  },
+    { _,_,_,_, _,_,_,_, 0,1, 2, 3,  _, _, _,_  },
+    { 0,1,2,3, _,_,_,_, 4,5, 6, 7,  _, _, _,_  },
+    { _,_,_,_, 0,1,2,3, 4,5, 6, 7,  _, _, _,_  },
+    { 0,1,2,3, 4,5,6,7, 8,9,10,11,  _, _, _,_  },
+    { _,_,_,_, _,_,_,_, _,_, _, _,  0, 1, 2, 3 },
+    { 0,1,2,3, _,_,_,_, _,_, _, _,  4, 5, 6, 7 },
+    { _,_,_,_, 0,1,2,3, _,_, _, _,  4, 5, 6, 7 },
+    { 0,1,2,3, 4,5,6,7, _,_, _, _,  8, 9,10,11 },
+    { _,_,_,_, _,_,_,_, 0,1, 2, 3,  4, 5, 6, 7 },
+    { 0,1,2,3, _,_,_,_, 4,5, 6, 7,  8, 9,10,11 },
+    { _,_,_,_, 0,1,2,3, 4,5, 6, 7,  8, 9,10,11 },
+    { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 },
+    #undef _
+  };
+  unsigned *op,*pex = ex;
+  for(op = out; b0; b0 >>= 4,op+=4) { const unsigned m = b0&0xf;
+    _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m);
+  }
+  for(op=out+64; b1; b1 >>= 4,op+=4) { const unsigned m = b1&0xf;
+    _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m);
+  }
+  #else
+  /* scalar fallback; the loop bodies were truncated in extraction and are
+     reconstructed: add the shifted exception bits to every marked slot */
+  unsigned k = 0;
+  while(b0) { unsigned x = ctzll(b0); out[x]    += ex[k++] << b; b0 &= b0-1; }
+  while(b1) { unsigned x = ctzll(b1); out[64+x] += ex[k++] << b; b1 &= b1-1; }
+  #endif
+  }
+  return in;
+}
+
+/* NOTE: the body of p4ddecx32() (the direct-access decoder declared in
+   vp4dd.h) was lost in extraction; only this tail survived, kept here
+   verbatim for reference:
+     ... (p4d->i&1)?(p4d->xmap+2):p4d->in + PAD8(n*xb); }
+     #endif                                                                  */
diff --git a/vsimple.c b/vsimple.c new file mode 100644 index 0000000..f8bff77 --- /dev/null +++ b/vsimple.c @@ -0,0 +1,42 @@
+/**
+    Copyright (C) powturbo 2013-2014
+    GPL v2 License
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple.c - "Integer Compression" variable simple +**/ + +#include "vsimple.h" + +#define USE_RLE + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 +#define SV_LIM unsigned char s_lim[] = { 0, 28, 28, 28, 28, 36, 36, 36, 36, 36, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 0 }; +#define SV_ITM unsigned s_itm[] = { -1, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 } +static SV_ITM; +static SV_LIM; + +#include +#define USIZE 32 +#include "vsimple_.h" + +#define USIZE 16 +#include "vsimple_.h" + diff --git a/vsimple.h b/vsimple.h new file mode 100644 index 0000000..b1684f4 --- /dev/null +++ b/vsimple.h @@ -0,0 +1,42 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple.h - "Integer Compression" variable simple +**/ + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned char *vsenc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out); +unsigned char *vsdec32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); + +unsigned char *vsenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out); +unsigned char *vsdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out); + +#ifdef __cplusplus +} +#endif + + + diff --git a/vsimple_.h b/vsimple_.h new file mode 100644 index 0000000..59f1dbe --- /dev/null +++ b/vsimple_.h @@ -0,0 +1,396 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple_.h - "Integer Compression" variable simple +**/ + +#include "vint.h" +#define uint_t TEMPLATE3(uint, USIZE, _t) + +unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ op) { + unsigned xm,m,r; + uint_t *e = in+n,*ip; + for(ip = in; ip < e; ) { + #ifdef USE_RLE + if(ip < e-4 && *ip == *(ip+1)) { uint_t *q = ip+1; while(q < e-1 && *(q+1) == *ip) q++; r = q - ip; + if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) { m = (*ip)?33:0; goto a; } + } else + #endif + r = 0; unsigned x = m = bsr32(*ip); + while((r+1)*(xm = x > m?x:m) <= s_lim[xm]) { m = xm; x = TEMPLATE2(bsr, USIZE)(*(ip+(++r))); } + if(/*xm != 32 &&*/ m) while(r < s_itm[m]) m++; + a:; + switch(m) { + case 0: ip += r; + if(--r >= 0xf) { + *op++ = 0xf0; + if(n <= 0x100) + *op++ = r; + else + vbput(op, r); + } else *op++ = r<<4; + break; + case 1: + *(unsigned *)op = 1 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 5 | + (unsigned)ip[ 2] << 6 | + (unsigned)ip[ 3] << 7 | + (unsigned)ip[ 4] << 8 | + (unsigned)ip[ 5] << 9 | + (unsigned)ip[ 6] << 10 | + (unsigned)ip[ 7] << 11 | + (unsigned)ip[ 8] << 12 | + (unsigned)ip[ 9] << 13 | + (unsigned)ip[10] << 14 | + (unsigned)ip[11] << 15 | + (unsigned)ip[12] << 16 | + (unsigned)ip[13] << 17 | + (unsigned)ip[14] << 18 | + (unsigned)ip[15] << 19 | + (unsigned)ip[16] << 20 | + (unsigned)ip[17] << 21 | + (unsigned)ip[18] << 22 | + (unsigned)ip[19] << 23 | + (unsigned)ip[20] << 24 | + (unsigned)ip[21] << 25 | + (unsigned)ip[22] << 26 | + (unsigned)ip[23] << 27 | + (unsigned)ip[24] << 28 | + (unsigned)ip[25] << 29 | + (unsigned)ip[26] << 30 | + (unsigned)ip[27] << 31; ip += 28; op += 4; + break; + case 2: + *(unsigned *)op = 2 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 6 | + (unsigned)ip[ 2] << 8 | + (unsigned)ip[ 3] << 10 | + (unsigned)ip[ 4] << 12 | + (unsigned)ip[ 5] << 14 | + (unsigned)ip[ 6] << 16 | + (unsigned)ip[ 7] << 18 | + (unsigned)ip[ 8] << 20 | + (unsigned)ip[ 9] << 22 | + (unsigned)ip[10] << 24 | + (unsigned)ip[11] << 26 | + (unsigned)ip[12] << 28 | + (unsigned)ip[13] << 30; ip += 14; op += 4; + break; + case 3: + *(unsigned *)op = 3 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 7 | + (unsigned)ip[ 2] << 10 | + (unsigned)ip[ 3] << 13 | + (unsigned)ip[ 4] << 16 | + (unsigned)ip[ 5] << 19 | + (unsigned)ip[ 6] << 22 | + (unsigned)ip[ 7] << 25 | + (unsigned)ip[ 8] << 28; ip += 9; op += 4; + break; + case 4: + *(uint64_t *)op = 4 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 8 | + (unsigned)ip[ 2] << 12 | + (unsigned)ip[ 3] << 16 | + (unsigned)ip[ 4] << 20 | + (unsigned)ip[ 5] << 24 | + (unsigned)ip[ 6] << 28; ip += 7; op += 4; + break; + case 5: + *(uint64_t *)op = 5 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 9 | + (unsigned)ip[ 2] << 14 | + (unsigned)ip[ 3] << 19 | + (unsigned)ip[ 4] << 24 | + (uint64_t)ip[ 5] << 29 | + (uint64_t)ip[ 6] << 34; ip += 7; op += 5; + break; + case 6: + *(uint64_t *)op = 6 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 10 | + (unsigned)ip[ 2] << 16 | + (unsigned)ip[ 3] << 22 | + (uint64_t)ip[ 4] << 28 | + (uint64_t)ip[ 5] << 34; ip += 6; op += 5; + break; + case 7: + *(uint64_t *)op = 7 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 11 | + (unsigned)ip[ 2] << 18 | + (uint64_t)ip[ 3] << 25 | + (uint64_t)ip[ 4] << 32; ip += 5; op += 5; + break; + case 8: + case 9: + *(uint64_t *)op = 9 | + (unsigned)ip[ 
0] << 4 |
+          (unsigned)ip[ 1] << 13 |
+          (unsigned)ip[ 2] << 22 |
+          (uint64_t)ip[ 3] << 31; ip += 4; op += 5;
+        break;
+      case 10:
+        *(uint64_t *)op = 10 |
+          (unsigned)ip[ 0] <<  4 |
+          (unsigned)ip[ 1] << 14 |
+          (uint64_t)ip[ 2] << 24 |
+          (uint64_t)ip[ 3] << 34 |
+          (uint64_t)ip[ 4] << 44 |
+          (uint64_t)ip[ 5] << 54; ip += 6; op += 8;
+        break;
+
+      case 11:
+      case 12:
+        *(uint64_t *)op = 12 |
+          (unsigned)ip[ 0] <<  4 |
+          (unsigned)ip[ 1] << 16 |
+          (uint64_t)ip[ 2] << 28 |
+          (uint64_t)ip[ 3] << 40 |
+          (uint64_t)ip[ 4] << 52; ip += 5; op += 8;
+        break;
+      case 13:
+      case 14:
+      case 15:
+        *(uint64_t *)op = 15 |
+          (unsigned)ip[ 0] <<  4 |
+          (uint64_t)ip[ 1] << 19 |
+          (uint64_t)ip[ 2] << 34 |
+          (uint64_t)ip[ 3] << 49; ip += 4; op += 8;
+        break;
+      case 16:
+      case 17:
+      case 18:
+      case 19:
+      case 20:
+        *(uint64_t *)op = 11 |
+          (unsigned)ip[ 0] <<  4 |
+          (uint64_t)ip[ 1] << 24 |
+          (uint64_t)ip[ 2] << 44; ip += 3; op += 8;
+        break;
+      case 21:
+      case 22:
+      case 23:
+      case 24:
+      case 25:
+      case 26:
+      case 27:
+      case 28:
+      case 29:
+      case 30:
+        *(uint64_t *)op = 13 |
+          (unsigned)ip[ 0] <<  4 |
+          (uint64_t)ip[ 1] << 34; ip += 2; op += 8;
+        break;
+      case 31:
+      case 32:
+        *(uint64_t *)op = 14 |
+          (uint64_t)ip[ 0] << 4; ip++; op += 5;
+        break;
+      #ifdef USE_RLE
+      case 33: ip += r;
+        if(--r >= 0xf) {
+          *op++ = 0xf0|8;
+          if(n <= 0x100)
+            *op++ = r;
+          else
+            vbput(op, r);
+        } else *op++ = r<<4|8;
+        vbput(op, ip[0]);
+        break;
+      #endif
+    }
+  }
+  return op;
+}
+
+#define OP(__x)  op[__x]  // *op++ //
+#define OPI(__x) op+=__x  //       //
+
+unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, uint_t *__restrict__ op) { uint_t *op_=op+n; /* was: unsigned *op_, wrong for USIZE 16 */
+  while(op < op_) { register uint64_t w=*(uint64_t *)ip;
+    switch(w & 15) {
+      case 0: {
+        int r = (w>>4)&0xf; ip++;
+        if(unlikely(r == 0xf)) {
+          if(n <= 0x100)
+            r = (w>>8)&0xff, ip++;
+          else
+            r = vbget(ip);
+        }
+        uint_t *q = op; op += r+1; while(q < op) *q++ = 0; /* run of r+1 zeros; the loop condition was lost in extraction and is reconstructed */
+      } break;
+      case 1:
+        OP( 0) = (w >>  4) & 1;
+        OP( 1) = (w >>  5) & 1;
+        OP( 2) = (w >>  6) & 1;
+        OP( 3) = (w >>  7) & 1;
+        OP( 4) = (w >>  8) & 1;
+        OP( 5) = (w >>  9) & 1;
+        OP( 6) = (w >> 10) & 1;
+        OP( 7) = (w >> 11) & 1;
+        OP( 8) = (w >> 12) & 1;
+        OP( 9) = (w >> 13) & 1;
+        OP(10) = (w >> 14) & 1;
+        OP(11) = (w >> 15) & 1;
+        OP(12) = (w >> 16) & 1;
+        OP(13) = (w >> 17) & 1;
+        OP(14) = (w >> 18) & 1;
+        OP(15) = (w >> 19) & 1;
+        OP(16) = (w >> 20) & 1;
+        OP(17) = (w >> 21) & 1;
+        OP(18) = (w >> 22) & 1;
+        OP(19) = (w >> 23) & 1;
+        OP(20) = (w >> 24) & 1;
+        OP(21) = (w >> 25) & 1;
+        OP(22) = (w >> 26) & 1;
+        OP(23) = (w >> 27) & 1;
+        OP(24) = (w >> 28) & 1;
+        OP(25) = (w >> 29) & 1;
+        OP(26) = (w >> 30) & 1;
+        OP(27) = (w >> 31) & 1; OPI( 28); ip+=4;
+        break;
+      case 2:
+        OP( 0) = (w >>  4) & 3;
+        OP( 1) = (w >>  6) & 3;
+        OP( 2) = (w >>  8) & 3;
+        OP( 3) = (w >> 10) & 3;
+        OP( 4) = (w >> 12) & 3;
+        OP( 5) = (w >> 14) & 3;
+        OP( 6) = (w >> 16) & 3;
+        OP( 7) = (w >> 18) & 3;
+        OP( 8) = (w >> 20) & 3;
+        OP( 9) = (w >> 22) & 3;
+        OP(10) = (w >> 24) & 3;
+        OP(11) = (w >> 26) & 3;
+        OP(12) = (w >> 28) & 3;
+        OP(13) = (w >> 30) & 3; OPI( 14); ip+=4;
+        break;
+      case 3:
+        OP( 0) = (w >>  4) & 7;
+        OP( 1) = (w >>  7) & 7;
+        OP( 2) = (w >> 10) & 7;
+        OP( 3) = (w >> 13) & 7;
+        OP( 4) = (w >> 16) & 7;
+        OP( 5) = (w >> 19) & 7;
+        OP( 6) = (w >> 22) & 7;
+        OP( 7) = (w >> 25) & 7;
+        OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4;
+        break;
+      case 4:
+        OP( 0) = (w >>  4) & 0xf;
+        OP( 1) = (w >>  8) & 0xf;
+        OP( 2) = (w >> 12) & 0xf;
+        OP( 3) = (w >> 16) & 0xf;
+        OP( 4) = (w >> 20) & 0xf;
+        OP( 5) = (w >> 24) & 0xf;
+        OP( 6) = (w >> 28) & 0xf; OPI( 7);
ip+=4;
+        break;
+      case 5:
+        OP( 0) = (w >>  4) & 0x1f;
+        OP( 1) = (w >>  9) & 0x1f;
+        OP( 2) = (w >> 14) & 0x1f;
+        OP( 3) = (w >> 19) & 0x1f;
+        OP( 4) = (w >> 24) & 0x1f;
+        OP( 5) = (w >> 29) & 0x1f;
+        OP( 6) = (w >> 34) & 0x1f; OPI( 7); ip+=5;
+        break;
+      case 6:
+        OP(0) = (w >>  4) & 0x3f;
+        OP(1) = (w >> 10) & 0x3f;
+        OP(2) = (w >> 16) & 0x3f;
+        OP(3) = (w >> 22) & 0x3f;
+        OP(4) = (w >> 28) & 0x3f;
+        OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5;
+        break;
+
+      case 7:
+        OP(0) = (w >>  4) & 0x7f;
+        OP(1) = (w >> 11) & 0x7f;
+        OP(2) = (w >> 18) & 0x7f;
+        OP(3) = (w >> 25) & 0x7f;
+        OP(4) = (w >> 32) & 0x7f; OPI( 5); ip+=5;
+        break;
+
+      #ifdef USE_RLE
+      case 8: {
+        int r = (w>>4)&0xf; ip++;
+        if(unlikely(r == 0xf)) {
+          if(n <= 0x100)
+            r = (w>>8)&0xff, ip++;
+          else
+            r = vbget(ip);
+        }
+        unsigned u = vbget(ip); uint_t *q=op; op+=r+1; while(q < op) *q++ = u; /* run of r+1 copies of u; loop condition reconstructed as in case 0 */
+      } break;
+      #endif
+      case 9:
+        OP(0) = (w >>  4) & 0x1ff;
+        OP(1) = (w >> 13) & 0x1ff;
+        OP(2) = (w >> 22) & 0x1ff;
+        OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5;
+        break;
+
+      case 10:
+        OP(0) = (w >>  4) & 0x3ff;
+        OP(1) = (w >> 14) & 0x3ff;
+        OP(2) = (w >> 24) & 0x3ff;
+        OP(3) = (w >> 34) & 0x3ff;
+        OP(4) = (w >> 44) & 0x3ff;
+        OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8;
+        break;
+      case 12:
+        OP(0) = (w >>  4) & 0xfff;
+        OP(1) = (w >> 16) & 0xfff;
+        OP(2) = (w >> 28) & 0xfff;
+        OP(3) = (w >> 40) & 0xfff;
+        OP(4) = (w >> 52) & 0xfff; OPI( 5); ip+=8;
+        break;
+      case 15:
+        OP(0) = (w >>  4) & 0x7fff;
+        OP(1) = (w >> 19) & 0x7fff;
+        OP(2) = (w >> 34) & 0x7fff;
+        OP(3) = (w >> 49) & 0x7fff; OPI( 4); ip+=8;
+        break;
+      case 11:
+        OP(0) = (w >>  4) & 0xfffff; // 20 bits
+        OP(1) = (w >> 24) & 0xfffff;
+        OP(2) = (w >> 44) & 0xfffff; OPI( 3); ip+=8;
+        break;
+      case 13:
+        OP(0) = (w >>  4) & ((1<<30)-1);
+        OP(1) = (w >> 34) & ((1<<30)-1); OPI( 2); ip+=8;
+        break;
+      case 14:
+        OP(0) = (w >>  4) & ((1ull<<32)-1); OPI( 1); ip+=5;
+        break;
+    }
+  }
+  return ip;
+}
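
In the vsimple format just defined, the low 4 bits of every code word select a layout; selector 2, for instance, packs fourteen 2-bit values into one 32-bit word (4 + 14*2 = 32 bits). A by-hand encode and decode of one such word (the values are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    int main(void) {
      uint32_t w = 2;                  /* selector 2 in the low 4 bits */
      unsigned i, v[14] = { 3,0,1,2,3,1,0,2,1,3,2,0,1,2 };
      for(i = 0; i < 14; i++) w |= v[i] << (4 + 2*i);   /* pack, as vsenc32 case 2 */
      for(i = 0; i < 14; i++)
        printf("%u", (w >> (4 + 2*i)) & 3);             /* unpack, as vsdec32 case 2 */
      printf("\n");
      return 0;
    }
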