commit b1bd8dcf23ee10f052af44a8aea466914497a61d
Author: powturbo
Date:   Tue Oct 28 22:19:48 2014 +0100

    Initial commit

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..c92045a
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+TurboPFor: Fastest Integer Compression
+======================================
+
+- 100% C, without inline assembly

+- Fastest **"Variable Byte"** implementation
+

+- Novel **"Variable Simple"**: faster than simple16 and more compact than simple64
+

+- Scalar **"Binary Packing"** with bulk decoding as fast as SIMD FastPFor in realistic (not "pure cache") scenarios
+- Binary Packing with **Direct/Random Access** without decompressing entire blocks
+- Access any single binary-packed entry with **zero decompression** (see the sketch after this list)
+

+- Novel **"TurboPFor"** (Patched Frame-of-Reference) scheme with direct access or bulk decoding
+

+- Several times faster than other libraries
+- Usage as easy as memcpy
+- Instant access to compressed *frequency* and *position* data in an inverted index with zero decoding
+
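The three "Binary Packing" bullets above rest on one observation: with a fixed bit width b per block, entry i starts at bit offset i*b, so fetching a single value costs at most two word reads and a shift, with no block decoding. Below is a minimal, self-contained scalar sketch of that idea (illustrative only; `bitpack`/`bitget` are hypothetical names for this example, not TurboPFor's actual API):

    #include <stdio.h>
    #include <stdint.h>

    /* Pack n values, each fitting in b bits (0 < b < 32), into out[].
       out[] must be zero-initialized and large enough (n*b bits). */
    static void bitpack(const uint32_t *in, int n, int b, uint32_t *out)
    {
        for (int i = 0; i < n; i++) {
            int w   = (i * b) >> 5;           /* word holding the low bits   */
            int off = (i * b) & 31;           /* bit offset inside that word */
            out[w] |= in[i] << off;
            if (off + b > 32)                 /* value straddles two words   */
                out[w + 1] |= in[i] >> (32 - off);
        }
    }

    /* Direct access: fetch entry i with two word reads, no block decoding. */
    static uint32_t bitget(const uint32_t *p, int b, int i)
    {
        int w   = (i * b) >> 5;
        int off = (i * b) & 31;
        uint32_t v = p[w] >> off;
        if (off + b > 32)
            v |= p[w + 1] << (32 - off);
        return v & ((1u << b) - 1);
    }

    int main(void)
    {
        uint32_t in[128], packed[128] = {0};  /* 8-bit packing uses 32 words */
        for (int i = 0; i < 128; i++) in[i] = i % 200;
        bitpack(in, 128, 8, packed);
        printf("entry 77 = %u\n", bitget(packed, 8, 77));  /* prints 77 */
        return 0;
    }

TurboPFor's "patched" variant keeps this layout but stores the few outliers as exceptions, so one large value does not inflate b for the whole block.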
+# Benchmark:
+i7-2600k at 3.4GHz, gcc 4.9, ubuntu 14.10.
+- Single thread
+- Realistic and practical benchmark with large integer arrays
+- No "pure cache" benchmark
+
+#### Synthetic data:
+coming soon!
+
+#### Data files
+ - clueweb09.sorted from FastPFor (http://lemire.me/data/integercompression2014.html)
+
+   	./icbench -n10000000000 clueweb09.sorted
+
+|      Size|Ratio in %|Bits/Integer|C Time MB/s|D Time MB/s|Function|
+|---------:|---------:|-----------:|----------:|----------:|:-------|
+| 514438405|      8.16|        2.61|     357.22|    1286.42|TurboPFor|
+| 514438405|      8.16|        2.61|     358.09|     309.70|TurboPFor DA|
+| 539841792|      8.56|        2.74|       6.47|     767.35|OptP4|
+| 583184112|      9.25|        2.96|     132.42|     914.89|Simple16|
+| 623548565|      9.89|        3.17|     235.32|     925.71|SimpleV|
+| 733365952|     11.64|        3.72|     162.21|    1312.15|Simple64|
+| 862464289|     13.68|        4.38|    1274.01|    1980.55|TurboPack|
+| 862464289|     13.68|        4.38|    1285.28|     868.06|TurboPack DA|
+| 862465391|     13.68|        4.38|    1402.12|    2075.15|SIMD-BitPack FPF|
+|6303089028|    100.00|       32.00|    1257.50|    1308.22|copy|
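Reading the table: Ratio is the compressed size relative to the raw 32-bit integers, and Bits/Integer is 32 × Ratio; for the TurboPFor row, 514438405 / 6303089028 ≈ 8.16 %, and 0.0816 × 32 ≈ 2.61 bits per integer. C Time and D Time are compression and decompression throughput in MB/s.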
+
+## Compile:
+  	make
+
+## Usage
+###### Synthetic data:
+ 1. Test all functions
+
+  	./icbench -a1.0 -m0 -x8 -n100000000
+
+   - Zipfian distribution alpha = 1.0 (ex. -a1.0 = uniform, -a1.5 = skewed distribution)
+   - number of integers = 100000000
+   - integer range from 0 to 255 (integer size = 0 to 8 bits)
+
+ 2. Individual function test (ex. copy, TurboPack, TurboPack direct access)
+
+  	./icbench -a1.0 -m0 -x8 -ecopy/turbopack/turbopack,da -n100000000
+
+###### Data files:
+ - Data file benchmark (file format as in FastPFor)
+
+  	./icbench -n10000000000 clueweb09.sorted
+
+## Reference:
+ - "SIMD-BitPack FPF" from FastPFor https://github.com/lemire/simdcomp
+ - OptP4 and Simple-16 from http://jinruhe.com/
+
diff --git a/aux/OPT_PFD/main.cpp b/aux/OPT_PFD/main.cpp
new file mode 100644
index 0000000..2c0ec06
--- /dev/null
+++ b/aux/OPT_PFD/main.cpp
@@ -0,0 +1,101 @@
+/*
+ * test for OPT-pfd
+ *
+ * Author: sding
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "opt_p4.h"
+
+using namespace std;
+
+char PATH[128] = "/usr/home/shuai/dumplist/wordlist_Excite"; // for reading list
+
+/* read one inverted list, record per-block maxima, then delta-code the doc ids */
+int get_list(char *term, unsigned int *doc_id, unsigned int *freq, unsigned int *maxc)
+{
+    char fpath[128];
+    sprintf(fpath, "%s/%s", PATH, term);
+    FILE *fdd = fopen(fpath, "r");
+    if (fdd == NULL) return 0;
+
+    int nread, npos;
+
+    nread = fread(&npos, sizeof(unsigned), 1, fdd);
+    npos = 0;
+
+    while (nread > 0)
+    {
+        nread = fread(&doc_id[npos], sizeof(unsigned), 1, fdd);
+        if (nread <= 0) break;
+        fread(&freq[npos], sizeof(unsigned), 1, fdd);
+        npos++;
+    }
+    fclose(fdd);
+
+    int i;
+
+    /* fill out the max values */
+    for (i = 0; i < npos; i += BS)
+        maxc[(i/BS)] = doc_id[i+BS-1];
+
+    /* take the gap for doc_id */
+    for (i = npos-1; i > 0; i--)
+    {
+        doc_id[i] -= doc_id[i-1];
+        doc_id[i] --;
+    }
+
+    for (i = 0; i < npos; i++)
+        freq[i]--;
+    return npos;
+}
+
+int main() // just for testing
+{
+    int MAX_NDOC = 25205179;
+    unsigned int *docid = new unsigned int[MAX_NDOC];
+    unsigned int *docid_check = new unsigned int[MAX_NDOC];
+
+    unsigned int *fre = new unsigned int[MAX_NDOC];
+    unsigned int *maxc = new unsigned int[MAX_NDOC/BS];
+    unsigned int *aux = new unsigned int[MAX_NDOC];
+    unsigned int *all_array = new unsigned int[2048]; // extra array for coding
+
+    int listSize = get_list("information", docid, fre, maxc);
+    cout<<"list size is "< size * 4) // int bytes
+        {
+            chunk_size = size * 4;
+            b = l;
+            temp_en = ex_n;
+        }
+    }
+
+    csize += chunk_size;
+    //printf("encode:%u\n", b);
+    p4_encode(doc_id + j, BS, b, aux + offset, &size, &ex_n);
+    offset += size;
+  }
+
+  return csize;
+}
diff --git a/aux/OPT_PFD/pf.h b/aux/OPT_PFD/pf.h
new file mode 100644
index 0000000..788f8cc
--- /dev/null
+++ b/aux/OPT_PFD/pf.h
@@ -0,0 +1,158 @@
+#include "s16head.h"
+#include "unpack.h"
+
+#define BS     128
+#define FRAC   0.10
+#define S      16
+#define PCHUNK 128
+
+void pack(unsigned int *v, unsigned int b, unsigned int n, unsigned int *w);
+
+int detailed_p4_encode(unsigned int **w, unsigned int *p, int num, int *chunk_size, int *exception_n)
+{
+    int i, j, t, s;
+
+    unsigned int b = cnum[num];
+    int bb_e;
+    int bb_p;
+    int p_low;
+    unsigned int e_n = 0;
+    int max_p = 0;
+    int max_e = 0;
+
+    unsigned int* out = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+    unsigned int* ex  = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+    unsigned int* po  = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*2);
+
+    unsigned int* tp = NULL;
+    unsigned int *_pp, *_ww;
+
+    if (b == 32)
+    {
+        (*w)[0] = ((b<<10)) + (0);
+        *w += 1;
+        for (i = 0; i < PCHUNK ; i++)
(*w)[i] = p[i]; + *w += (PCHUNK); + (*chunk_size) = 1 + BS; + + free(out); + free(ex); + free(po); + return 0; + } + + for (i = 0; i < PCHUNK ; i++) + { + if ( p[i] >= (1<> b); + po[(e_n++)] = i; // + } + else + out[i] = p[i]; + } + + if (1) // force to pass every time + { + /*get the gap of position*/ + for(j = e_n-1;j>0;j--) + { + po[j] = po[j] - po[j-1] ; + po[j] --; + } + + s = ((b * PCHUNK)>>5); + tp = (*w); + (*w)[0] = ((num<<10))+e_n; // record b and number of exceptions into this value, in the other version we pick this value out and did not count it + (*w) += 1; + for (i = 0; i < s; i++) (*w)[i] = 0; + pack(out, b, PCHUNK , *w); + *w += s; + + unsigned int *all_array = (unsigned*)malloc(sizeof(unsigned)*PCHUNK*4) ; + for(j=0;j>5; + s = 32 - b - (bp & 31); + if (s >= 0) + w[wp] |= (v[i]<>s); + w[wp+1] = (v[i]<<(32-s)); + } + } +} + +/*modified p4decode */ +unsigned int *detailed_p4_decode(unsigned int *_p, unsigned int *_w, unsigned int * all_array) +{ + + int i, s; + unsigned int x; + int flag = _w[0]; + (_w)++; + + unsigned int *_ww,*_pp; + unsigned int b = ((flag>>10) & 31); + unsigned int e_n = (flag & 1023) ; + + (unpack[b])(_p, _w); + + b = cnum[b]; + _w += ((b * BS)>>5); + unsigned int _k = 0; + unsigned int psum = 0; + if(e_n != 0 ) + { + for (_pp = all_array, _ww = (unsigned int *)(_w); _pp < &(all_array[e_n*2]);) + { + S16_DECODE(_ww, _pp); + } + + _w += (_ww - _w); + psum = all_array[0]; + + for(i=0;i>28; \ + switch(_k) \ + { \ + case 0: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 1: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 1; _p++; \ + *_p = (*_w>>15) & 1; _p++; \ + *_p = (*_w>>16) & 1; _p++; \ + *_p = (*_w>>17) & 1; _p++; \ + *_p = (*_w>>18) & 1; _p++; \ + *_p = (*_w>>19) & 1; _p++; \ + *_p = (*_w>>20) & 1; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = (*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 2: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 3; _p++; \ + *_p = (*_w>>9) & 3; _p++; \ + *_p = (*_w>>11) & 3; _p++; \ + *_p = (*_w>>13) & 3; _p++; \ + *_p = (*_w>>15) & 3; _p++; \ + *_p = (*_w>>17) & 3; _p++; \ + *_p = (*_w>>19) & 3; _p++; \ + *_p = (*_w>>21) & 1; _p++; \ + *_p = (*_w>>22) & 1; _p++; \ + *_p = (*_w>>23) & 1; _p++; \ + *_p = 
(*_w>>24) & 1; _p++; \ + *_p = (*_w>>25) & 1; _p++; \ + *_p = (*_w>>26) & 1; _p++; \ + *_p = (*_w>>27) & 1; _p++; \ + break; \ + case 3: \ + *_p = (*_w) & 1; _p++; \ + *_p = (*_w>>1) & 1; _p++; \ + *_p = (*_w>>2) & 1; _p++; \ + *_p = (*_w>>3) & 1; _p++; \ + *_p = (*_w>>4) & 1; _p++; \ + *_p = (*_w>>5) & 1; _p++; \ + *_p = (*_w>>6) & 1; _p++; \ + *_p = (*_w>>7) & 1; _p++; \ + *_p = (*_w>>8) & 1; _p++; \ + *_p = (*_w>>9) & 1; _p++; \ + *_p = (*_w>>10) & 1; _p++; \ + *_p = (*_w>>11) & 1; _p++; \ + *_p = (*_w>>12) & 1; _p++; \ + *_p = (*_w>>13) & 1; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 4: \ + *_p = (*_w) & 3; _p++; \ + *_p = (*_w>>2) & 3; _p++; \ + *_p = (*_w>>4) & 3; _p++; \ + *_p = (*_w>>6) & 3; _p++; \ + *_p = (*_w>>8) & 3; _p++; \ + *_p = (*_w>>10) & 3; _p++; \ + *_p = (*_w>>12) & 3; _p++; \ + *_p = (*_w>>14) & 3; _p++; \ + *_p = (*_w>>16) & 3; _p++; \ + *_p = (*_w>>18) & 3; _p++; \ + *_p = (*_w>>20) & 3; _p++; \ + *_p = (*_w>>22) & 3; _p++; \ + *_p = (*_w>>24) & 3; _p++; \ + *_p = (*_w>>26) & 3; _p++; \ + break; \ + case 5: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 7; _p++; \ + *_p = (*_w>>7) & 7; _p++; \ + *_p = (*_w>>10) & 7; _p++; \ + *_p = (*_w>>13) & 7; _p++; \ + *_p = (*_w>>16) & 7; _p++; \ + *_p = (*_w>>19) & 7; _p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 6: \ + *_p = (*_w) & 7; _p++; \ + *_p = (*_w>>3) & 15; _p++; \ + *_p = (*_w>>7) & 15; _p++; \ + *_p = (*_w>>11) & 15; _p++; \ + *_p = (*_w>>15) & 15; _p++; \ + *_p = (*_w>>19) & 7; _p++; \ + *_p = (*_w>>22) & 7; _p++; \ + *_p = (*_w>>25) & 7; _p++; \ + break; \ + case 7: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 15; _p++; \ + *_p = (*_w>>12) & 15; _p++; \ + *_p = (*_w>>16) & 15; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 8: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 31; _p++; \ + *_p = (*_w>>15) & 31; _p++; \ + *_p = (*_w>>20) & 15; _p++; \ + *_p = (*_w>>24) & 15; _p++; \ + break; \ + case 9: \ + *_p = (*_w) & 15; _p++; \ + *_p = (*_w>>4) & 15; _p++; \ + *_p = (*_w>>8) & 31; _p++; \ + *_p = (*_w>>13) & 31; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 10: \ + *_p = (*_w) & 63; _p++; \ + *_p = (*_w>>6) & 63; _p++; \ + *_p = (*_w>>12) & 63; _p++; \ + *_p = (*_w>>18) & 31; _p++; \ + *_p = (*_w>>23) & 31; _p++; \ + break; \ + case 11: \ + *_p = (*_w) & 31; _p++; \ + *_p = (*_w>>5) & 31; _p++; \ + *_p = (*_w>>10) & 63; _p++; \ + *_p = (*_w>>16) & 63; _p++; \ + *_p = (*_w>>22) & 63; _p++; \ + break; \ + case 12: \ + *_p = (*_w) & 127; _p++; \ + *_p = (*_w>>7) & 127; _p++; \ + *_p = (*_w>>14) & 127; _p++; \ + *_p = (*_w>>21) & 127; _p++; \ + break; \ + case 13: \ + *_p = (*_w) & 1023; _p++; \ + *_p = (*_w>>10) & 511; _p++; \ + *_p = (*_w>>19) & 511; _p++; \ + break; \ + case 14: \ + *_p = (*_w) & 16383; _p++; \ + *_p = (*_w>>14) & 16383; _p++; \ + break; \ + case 15: \ + *_p = (*_w) & ((1<<28)-1); _p++; \ + break; \ + }\ + _w++; \ +} + + + + + diff --git a/aux/OPT_PFD/unpack.h b/aux/OPT_PFD/unpack.h new file mode 100644 index 0000000..fa810e9 --- /dev/null +++ b/aux/OPT_PFD/unpack.h @@ -0,0 +1,773 @@ + +/*************************************************************/ +/* macros for fast unpacking of integers of fixed bit 
length */ +/*************************************************************/ + +#define BS 128 + +/* supported bit lengths */ +int cnum[17] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,32}; + +void unpack0(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i++) p[i] = 0; +} + + +void unpack1(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 1) + { + p[0] = (w[0] >> 31); + p[1] = (w[0] >> 30) & 1; + p[2] = (w[0] >> 29) & 1; + p[3] = (w[0] >> 28) & 1; + p[4] = (w[0] >> 27) & 1; + p[5] = (w[0] >> 26) & 1; + p[6] = (w[0] >> 25) & 1; + p[7] = (w[0] >> 24) & 1; + p[8] = (w[0] >> 23) & 1; + p[9] = (w[0] >> 22) & 1; + p[10] = (w[0] >> 21) & 1; + p[11] = (w[0] >> 20) & 1; + p[12] = (w[0] >> 19) & 1; + p[13] = (w[0] >> 18) & 1; + p[14] = (w[0] >> 17) & 1; + p[15] = (w[0] >> 16) & 1; + p[16] = (w[0] >> 15) & 1; + p[17] = (w[0] >> 14) & 1; + p[18] = (w[0] >> 13) & 1; + p[19] = (w[0] >> 12) & 1; + p[20] = (w[0] >> 11) & 1; + p[21] = (w[0] >> 10) & 1; + p[22] = (w[0] >> 9) & 1; + p[23] = (w[0] >> 8) & 1; + p[24] = (w[0] >> 7) & 1; + p[25] = (w[0] >> 6) & 1; + p[26] = (w[0] >> 5) & 1; + p[27] = (w[0] >> 4) & 1; + p[28] = (w[0] >> 3) & 1; + p[29] = (w[0] >> 2) & 1; + p[30] = (w[0] >> 1) & 1; + p[31] = (w[0]) & 1; + } +} + + +void unpack2(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 2) + { + p[0] = (w[0] >> 30); + p[1] = (w[0] >> 28) & 3; + p[2] = (w[0] >> 26) & 3; + p[3] = (w[0] >> 24) & 3; + p[4] = (w[0] >> 22) & 3; + p[5] = (w[0] >> 20) & 3; + p[6] = (w[0] >> 18) & 3; + p[7] = (w[0] >> 16) & 3; + p[8] = (w[0] >> 14) & 3; + p[9] = (w[0] >> 12) & 3; + p[10] = (w[0] >> 10) & 3; + p[11] = (w[0] >> 8) & 3; + p[12] = (w[0] >> 6) & 3; + p[13] = (w[0] >> 4) & 3; + p[14] = (w[0] >> 2) & 3; + p[15] = (w[0]) & 3; + p[16] = (w[1] >> 30); + p[17] = (w[1] >> 28) & 3; + p[18] = (w[1] >> 26) & 3; + p[19] = (w[1] >> 24) & 3; + p[20] = (w[1] >> 22) & 3; + p[21] = (w[1] >> 20) & 3; + p[22] = (w[1] >> 18) & 3; + p[23] = (w[1] >> 16) & 3; + p[24] = (w[1] >> 14) & 3; + p[25] = (w[1] >> 12) & 3; + p[26] = (w[1] >> 10) & 3; + p[27] = (w[1] >> 8) & 3; + p[28] = (w[1] >> 6) & 3; + p[29] = (w[1] >> 4) & 3; + p[30] = (w[1] >> 2) & 3; + p[31] = (w[1]) & 3; + } +} + + +void unpack3(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 3) + { + p[0] = (w[0] >> 29); + p[1] = (w[0] >> 26) & 7; + p[2] = (w[0] >> 23) & 7; + p[3] = (w[0] >> 20) & 7; + p[4] = (w[0] >> 17) & 7; + p[5] = (w[0] >> 14) & 7; + p[6] = (w[0] >> 11) & 7; + p[7] = (w[0] >> 8) & 7; + p[8] = (w[0] >> 5) & 7; + p[9] = (w[0] >> 2) & 7; + p[10] = (w[0] << 1) & 7; + p[10] |= (w[1] >> 31); + p[11] = (w[1] >> 28) & 7; + p[12] = (w[1] >> 25) & 7; + p[13] = (w[1] >> 22) & 7; + p[14] = (w[1] >> 19) & 7; + p[15] = (w[1] >> 16) & 7; + p[16] = (w[1] >> 13) & 7; + p[17] = (w[1] >> 10) & 7; + p[18] = (w[1] >> 7) & 7; + p[19] = (w[1] >> 4) & 7; + p[20] = (w[1] >> 1) & 7; + p[21] = (w[1] << 2) & 7; + p[21] |= (w[2] >> 30); + p[22] = (w[2] >> 27) & 7; + p[23] = (w[2] >> 24) & 7; + p[24] = (w[2] >> 21) & 7; + p[25] = (w[2] >> 18) & 7; + p[26] = (w[2] >> 15) & 7; + p[27] = (w[2] >> 12) & 7; + p[28] = (w[2] >> 9) & 7; + p[29] = (w[2] >> 6) & 7; + p[30] = (w[2] >> 3) & 7; + p[31] = (w[2]) & 7; + } +} + + +void unpack4(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 4) + { + p[0] = (w[0] >> 28); + p[1] = (w[0] >> 24) & 15; + p[2] = (w[0] >> 20) & 15; + p[3] = (w[0] >> 16) & 15; + p[4] = (w[0] >> 12) & 
15; + p[5] = (w[0] >> 8) & 15; + p[6] = (w[0] >> 4) & 15; + p[7] = (w[0]) & 15; + p[8] = (w[1] >> 28); + p[9] = (w[1] >> 24) & 15; + p[10] = (w[1] >> 20) & 15; + p[11] = (w[1] >> 16) & 15; + p[12] = (w[1] >> 12) & 15; + p[13] = (w[1] >> 8) & 15; + p[14] = (w[1] >> 4) & 15; + p[15] = (w[1]) & 15; + p[16] = (w[2] >> 28); + p[17] = (w[2] >> 24) & 15; + p[18] = (w[2] >> 20) & 15; + p[19] = (w[2] >> 16) & 15; + p[20] = (w[2] >> 12) & 15; + p[21] = (w[2] >> 8) & 15; + p[22] = (w[2] >> 4) & 15; + p[23] = (w[2]) & 15; + p[24] = (w[3] >> 28); + p[25] = (w[3] >> 24) & 15; + p[26] = (w[3] >> 20) & 15; + p[27] = (w[3] >> 16) & 15; + p[28] = (w[3] >> 12) & 15; + p[29] = (w[3] >> 8) & 15; + p[30] = (w[3] >> 4) & 15; + p[31] = (w[3]) & 15; + } +} + + +void unpack5(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 5) + { + p[0] = (w[0] >> 27); + p[1] = (w[0] >> 22) & 31; + p[2] = (w[0] >> 17) & 31; + p[3] = (w[0] >> 12) & 31; + p[4] = (w[0] >> 7) & 31; + p[5] = (w[0] >> 2) & 31; + p[6] = (w[0] << 3) & 31; + p[6] |= (w[1] >> 29); + p[7] = (w[1] >> 24) & 31; + p[8] = (w[1] >> 19) & 31; + p[9] = (w[1] >> 14) & 31; + p[10] = (w[1] >> 9) & 31; + p[11] = (w[1] >> 4) & 31; + p[12] = (w[1] << 1) & 31; + p[12] |= (w[2] >> 31); + p[13] = (w[2] >> 26) & 31; + p[14] = (w[2] >> 21) & 31; + p[15] = (w[2] >> 16) & 31; + p[16] = (w[2] >> 11) & 31; + p[17] = (w[2] >> 6) & 31; + p[18] = (w[2] >> 1) & 31; + p[19] = (w[2] << 4) & 31; + p[19] |= (w[3] >> 28); + p[20] = (w[3] >> 23) & 31; + p[21] = (w[3] >> 18) & 31; + p[22] = (w[3] >> 13) & 31; + p[23] = (w[3] >> 8) & 31; + p[24] = (w[3] >> 3) & 31; + p[25] = (w[3] << 2) & 31; + p[25] |= (w[4] >> 30); + p[26] = (w[4] >> 25) & 31; + p[27] = (w[4] >> 20) & 31; + p[28] = (w[4] >> 15) & 31; + p[29] = (w[4] >> 10) & 31; + p[30] = (w[4] >> 5) & 31; + p[31] = (w[4]) & 31; + } +} + + +void unpack6(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 6) + { + p[0] = (w[0] >> 26); + p[1] = (w[0] >> 20) & 63; + p[2] = (w[0] >> 14) & 63; + p[3] = (w[0] >> 8) & 63; + p[4] = (w[0] >> 2) & 63; + p[5] = (w[0] << 4) & 63; + p[5] |= (w[1] >> 28); + p[6] = (w[1] >> 22) & 63; + p[7] = (w[1] >> 16) & 63; + p[8] = (w[1] >> 10) & 63; + p[9] = (w[1] >> 4) & 63; + p[10] = (w[1] << 2) & 63; + p[10] |= (w[2] >> 30); + p[11] = (w[2] >> 24) & 63; + p[12] = (w[2] >> 18) & 63; + p[13] = (w[2] >> 12) & 63; + p[14] = (w[2] >> 6) & 63; + p[15] = (w[2]) & 63; + p[16] = (w[3] >> 26); + p[17] = (w[3] >> 20) & 63; + p[18] = (w[3] >> 14) & 63; + p[19] = (w[3] >> 8) & 63; + p[20] = (w[3] >> 2) & 63; + p[21] = (w[3] << 4) & 63; + p[21] |= (w[4] >> 28); + p[22] = (w[4] >> 22) & 63; + p[23] = (w[4] >> 16) & 63; + p[24] = (w[4] >> 10) & 63; + p[25] = (w[4] >> 4) & 63; + p[26] = (w[4] << 2) & 63; + p[26] |= (w[5] >> 30); + p[27] = (w[5] >> 24) & 63; + p[28] = (w[5] >> 18) & 63; + p[29] = (w[5] >> 12) & 63; + p[30] = (w[5] >> 6) & 63; + p[31] = (w[5]) & 63; + } +} + + +void unpack7(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 7) + { + p[0] = (w[0] >> 25); + p[1] = (w[0] >> 18) & 127; + p[2] = (w[0] >> 11) & 127; + p[3] = (w[0] >> 4) & 127; + p[4] = (w[0] << 3) & 127; + p[4] |= (w[1] >> 29); + p[5] = (w[1] >> 22) & 127; + p[6] = (w[1] >> 15) & 127; + p[7] = (w[1] >> 8) & 127; + p[8] = (w[1] >> 1) & 127; + p[9] = (w[1] << 6) & 127; + p[9] |= (w[2] >> 26); + p[10] = (w[2] >> 19) & 127; + p[11] = (w[2] >> 12) & 127; + p[12] = (w[2] >> 5) & 127; + p[13] = (w[2] << 2) & 127; + p[13] |= (w[3] >> 
30); + p[14] = (w[3] >> 23) & 127; + p[15] = (w[3] >> 16) & 127; + p[16] = (w[3] >> 9) & 127; + p[17] = (w[3] >> 2) & 127; + p[18] = (w[3] << 5) & 127; + p[18] |= (w[4] >> 27); + p[19] = (w[4] >> 20) & 127; + p[20] = (w[4] >> 13) & 127; + p[21] = (w[4] >> 6) & 127; + p[22] = (w[4] << 1) & 127; + p[22] |= (w[5] >> 31); + p[23] = (w[5] >> 24) & 127; + p[24] = (w[5] >> 17) & 127; + p[25] = (w[5] >> 10) & 127; + p[26] = (w[5] >> 3) & 127; + p[27] = (w[5] << 4) & 127; + p[27] |= (w[6] >> 28); + p[28] = (w[6] >> 21) & 127; + p[29] = (w[6] >> 14) & 127; + p[30] = (w[6] >> 7) & 127; + p[31] = (w[6]) & 127; + } +} + + +void unpack8(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 8) + { + p[0] = (w[0] >> 24); + p[1] = (w[0] >> 16) & 255; + p[2] = (w[0] >> 8) & 255; + p[3] = (w[0]) & 255; + p[4] = (w[1] >> 24); + p[5] = (w[1] >> 16) & 255; + p[6] = (w[1] >> 8) & 255; + p[7] = (w[1]) & 255; + p[8] = (w[2] >> 24); + p[9] = (w[2] >> 16) & 255; + p[10] = (w[2] >> 8) & 255; + p[11] = (w[2]) & 255; + p[12] = (w[3] >> 24); + p[13] = (w[3] >> 16) & 255; + p[14] = (w[3] >> 8) & 255; + p[15] = (w[3]) & 255; + p[16] = (w[4] >> 24); + p[17] = (w[4] >> 16) & 255; + p[18] = (w[4] >> 8) & 255; + p[19] = (w[4]) & 255; + p[20] = (w[5] >> 24); + p[21] = (w[5] >> 16) & 255; + p[22] = (w[5] >> 8) & 255; + p[23] = (w[5]) & 255; + p[24] = (w[6] >> 24); + p[25] = (w[6] >> 16) & 255; + p[26] = (w[6] >> 8) & 255; + p[27] = (w[6]) & 255; + p[28] = (w[7] >> 24); + p[29] = (w[7] >> 16) & 255; + p[30] = (w[7] >> 8) & 255; + p[31] = (w[7]) & 255; + } +} + + +void unpack9(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 9) + { + p[0] = (w[0] >> 23); + p[1] = (w[0] >> 14) & 511; + p[2] = (w[0] >> 5) & 511; + p[3] = (w[0] << 4) & 511; + p[3] |= (w[1] >> 28); + p[4] = (w[1] >> 19) & 511; + p[5] = (w[1] >> 10) & 511; + p[6] = (w[1] >> 1) & 511; + p[7] = (w[1] << 8) & 511; + p[7] |= (w[2] >> 24); + p[8] = (w[2] >> 15) & 511; + p[9] = (w[2] >> 6) & 511; + p[10] = (w[2] << 3) & 511; + p[10] |= (w[3] >> 29); + p[11] = (w[3] >> 20) & 511; + p[12] = (w[3] >> 11) & 511; + p[13] = (w[3] >> 2) & 511; + p[14] = (w[3] << 7) & 511; + p[14] |= (w[4] >> 25); + p[15] = (w[4] >> 16) & 511; + p[16] = (w[4] >> 7) & 511; + p[17] = (w[4] << 2) & 511; + p[17] |= (w[5] >> 30); + p[18] = (w[5] >> 21) & 511; + p[19] = (w[5] >> 12) & 511; + p[20] = (w[5] >> 3) & 511; + p[21] = (w[5] << 6) & 511; + p[21] |= (w[6] >> 26); + p[22] = (w[6] >> 17) & 511; + p[23] = (w[6] >> 8) & 511; + p[24] = (w[6] << 1) & 511; + p[24] |= (w[7] >> 31); + p[25] = (w[7] >> 22) & 511; + p[26] = (w[7] >> 13) & 511; + p[27] = (w[7] >> 4) & 511; + p[28] = (w[7] << 5) & 511; + p[28] |= (w[8] >> 27); + p[29] = (w[8] >> 18) & 511; + p[30] = (w[8] >> 9) & 511; + p[31] = (w[8]) & 511; + } +} + + +void unpack10(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 10) + { + p[0] = (w[0] >> 22); + p[1] = (w[0] >> 12) & 1023; + p[2] = (w[0] >> 2) & 1023; + p[3] = (w[0] << 8) & 1023; + p[3] |= (w[1] >> 24); + p[4] = (w[1] >> 14) & 1023; + p[5] = (w[1] >> 4) & 1023; + p[6] = (w[1] << 6) & 1023; + p[6] |= (w[2] >> 26); + p[7] = (w[2] >> 16) & 1023; + p[8] = (w[2] >> 6) & 1023; + p[9] = (w[2] << 4) & 1023; + p[9] |= (w[3] >> 28); + p[10] = (w[3] >> 18) & 1023; + p[11] = (w[3] >> 8) & 1023; + p[12] = (w[3] << 2) & 1023; + p[12] |= (w[4] >> 30); + p[13] = (w[4] >> 20) & 1023; + p[14] = (w[4] >> 10) & 1023; + p[15] = (w[4]) & 1023; + p[16] = (w[5] >> 22); + p[17] = (w[5] >> 
12) & 1023; + p[18] = (w[5] >> 2) & 1023; + p[19] = (w[5] << 8) & 1023; + p[19] |= (w[6] >> 24); + p[20] = (w[6] >> 14) & 1023; + p[21] = (w[6] >> 4) & 1023; + p[22] = (w[6] << 6) & 1023; + p[22] |= (w[7] >> 26); + p[23] = (w[7] >> 16) & 1023; + p[24] = (w[7] >> 6) & 1023; + p[25] = (w[7] << 4) & 1023; + p[25] |= (w[8] >> 28); + p[26] = (w[8] >> 18) & 1023; + p[27] = (w[8] >> 8) & 1023; + p[28] = (w[8] << 2) & 1023; + p[28] |= (w[9] >> 30); + p[29] = (w[9] >> 20) & 1023; + p[30] = (w[9] >> 10) & 1023; + p[31] = (w[9]) & 1023; + } +} + + +void unpack11(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 11) + { + p[0] = (w[0] >> 21); + p[1] = (w[0] >> 10) & 2047; + p[2] = (w[0] << 1) & 2047; + p[2] |= (w[1] >> 31); + p[3] = (w[1] >> 20) & 2047; + p[4] = (w[1] >> 9) & 2047; + p[5] = (w[1] << 2) & 2047; + p[5] |= (w[2] >> 30); + p[6] = (w[2] >> 19) & 2047; + p[7] = (w[2] >> 8) & 2047; + p[8] = (w[2] << 3) & 2047; + p[8] |= (w[3] >> 29); + p[9] = (w[3] >> 18) & 2047; + p[10] = (w[3] >> 7) & 2047; + p[11] = (w[3] << 4) & 2047; + p[11] |= (w[4] >> 28); + p[12] = (w[4] >> 17) & 2047; + p[13] = (w[4] >> 6) & 2047; + p[14] = (w[4] << 5) & 2047; + p[14] |= (w[5] >> 27); + p[15] = (w[5] >> 16) & 2047; + p[16] = (w[5] >> 5) & 2047; + p[17] = (w[5] << 6) & 2047; + p[17] |= (w[6] >> 26); + p[18] = (w[6] >> 15) & 2047; + p[19] = (w[6] >> 4) & 2047; + p[20] = (w[6] << 7) & 2047; + p[20] |= (w[7] >> 25); + p[21] = (w[7] >> 14) & 2047; + p[22] = (w[7] >> 3) & 2047; + p[23] = (w[7] << 8) & 2047; + p[23] |= (w[8] >> 24); + p[24] = (w[8] >> 13) & 2047; + p[25] = (w[8] >> 2) & 2047; + p[26] = (w[8] << 9) & 2047; + p[26] |= (w[9] >> 23); + p[27] = (w[9] >> 12) & 2047; + p[28] = (w[9] >> 1) & 2047; + p[29] = (w[9] << 10) & 2047; + p[29] |= (w[10] >> 22); + p[30] = (w[10] >> 11) & 2047; + p[31] = (w[10]) & 2047; + } +} + + +void unpack12(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 12) + { + p[0] = (w[0] >> 20); + p[1] = (w[0] >> 8) & 4095; + p[2] = (w[0] << 4) & 4095; + p[2] |= (w[1] >> 28); + p[3] = (w[1] >> 16) & 4095; + p[4] = (w[1] >> 4) & 4095; + p[5] = (w[1] << 8) & 4095; + p[5] |= (w[2] >> 24); + p[6] = (w[2] >> 12) & 4095; + p[7] = (w[2]) & 4095; + p[8] = (w[3] >> 20); + p[9] = (w[3] >> 8) & 4095; + p[10] = (w[3] << 4) & 4095; + p[10] |= (w[4] >> 28); + p[11] = (w[4] >> 16) & 4095; + p[12] = (w[4] >> 4) & 4095; + p[13] = (w[4] << 8) & 4095; + p[13] |= (w[5] >> 24); + p[14] = (w[5] >> 12) & 4095; + p[15] = (w[5]) & 4095; + p[16] = (w[6] >> 20); + p[17] = (w[6] >> 8) & 4095; + p[18] = (w[6] << 4) & 4095; + p[18] |= (w[7] >> 28); + p[19] = (w[7] >> 16) & 4095; + p[20] = (w[7] >> 4) & 4095; + p[21] = (w[7] << 8) & 4095; + p[21] |= (w[8] >> 24); + p[22] = (w[8] >> 12) & 4095; + p[23] = (w[8]) & 4095; + p[24] = (w[9] >> 20); + p[25] = (w[9] >> 8) & 4095; + p[26] = (w[9] << 4) & 4095; + p[26] |= (w[10] >> 28); + p[27] = (w[10] >> 16) & 4095; + p[28] = (w[10] >> 4) & 4095; + p[29] = (w[10] << 8) & 4095; + p[29] |= (w[11] >> 24); + p[30] = (w[11] >> 12) & 4095; + p[31] = (w[11]) & 4095; + } +} + + +void unpack13(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 13) + { + p[0] = (w[0] >> 19); + p[1] = (w[0] >> 6) & 8191; + p[2] = (w[0] << 7) & 8191; + p[2] |= (w[1] >> 25); + p[3] = (w[1] >> 12) & 8191; + p[4] = (w[1] << 1) & 8191; + p[4] |= (w[2] >> 31); + p[5] = (w[2] >> 18) & 8191; + p[6] = (w[2] >> 5) & 8191; + p[7] = (w[2] << 8) & 8191; + p[7] |= (w[3] >> 24); + p[8] = (w[3] >> 11) 
& 8191; + p[9] = (w[3] << 2) & 8191; + p[9] |= (w[4] >> 30); + p[10] = (w[4] >> 17) & 8191; + p[11] = (w[4] >> 4) & 8191; + p[12] = (w[4] << 9) & 8191; + p[12] |= (w[5] >> 23); + p[13] = (w[5] >> 10) & 8191; + p[14] = (w[5] << 3) & 8191; + p[14] |= (w[6] >> 29); + p[15] = (w[6] >> 16) & 8191; + p[16] = (w[6] >> 3) & 8191; + p[17] = (w[6] << 10) & 8191; + p[17] |= (w[7] >> 22); + p[18] = (w[7] >> 9) & 8191; + p[19] = (w[7] << 4) & 8191; + p[19] |= (w[8] >> 28); + p[20] = (w[8] >> 15) & 8191; + p[21] = (w[8] >> 2) & 8191; + p[22] = (w[8] << 11) & 8191; + p[22] |= (w[9] >> 21); + p[23] = (w[9] >> 8) & 8191; + p[24] = (w[9] << 5) & 8191; + p[24] |= (w[10] >> 27); + p[25] = (w[10] >> 14) & 8191; + p[26] = (w[10] >> 1) & 8191; + p[27] = (w[10] << 12) & 8191; + p[27] |= (w[11] >> 20); + p[28] = (w[11] >> 7) & 8191; + p[29] = (w[11] << 6) & 8191; + p[29] |= (w[12] >> 26); + p[30] = (w[12] >> 13) & 8191; + p[31] = (w[12]) & 8191; + } +} + + +void unpack16(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 16) + { + p[0] = (w[0] >> 16); + p[1] = (w[0]) & 65535; + p[2] = (w[1] >> 16); + p[3] = (w[1]) & 65535; + p[4] = (w[2] >> 16); + p[5] = (w[2]) & 65535; + p[6] = (w[3] >> 16); + p[7] = (w[3]) & 65535; + p[8] = (w[4] >> 16); + p[9] = (w[4]) & 65535; + p[10] = (w[5] >> 16); + p[11] = (w[5]) & 65535; + p[12] = (w[6] >> 16); + p[13] = (w[6]) & 65535; + p[14] = (w[7] >> 16); + p[15] = (w[7]) & 65535; + p[16] = (w[8] >> 16); + p[17] = (w[8]) & 65535; + p[18] = (w[9] >> 16); + p[19] = (w[9]) & 65535; + p[20] = (w[10] >> 16); + p[21] = (w[10]) & 65535; + p[22] = (w[11] >> 16); + p[23] = (w[11]) & 65535; + p[24] = (w[12] >> 16); + p[25] = (w[12]) & 65535; + p[26] = (w[13] >> 16); + p[27] = (w[13]) & 65535; + p[28] = (w[14] >> 16); + p[29] = (w[14]) & 65535; + p[30] = (w[15] >> 16); + p[31] = (w[15]) & 65535; + } +} + + +void unpack20(unsigned int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 20) + { + p[0] = (w[0] >> 12); + p[1] = (w[0] << 8) & ((1<<20)-1); + p[1] |= (w[1] >> 24); + p[2] = (w[1] >> 4) & ((1<<20)-1); + p[3] = (w[1] << 16) & ((1<<20)-1); + p[3] |= (w[2] >> 16); + p[4] = (w[2] << 4) & ((1<<20)-1); + p[4] |= (w[3] >> 28); + p[5] = (w[3] >> 8) & ((1<<20)-1); + p[6] = (w[3] << 12) & ((1<<20)-1); + p[6] |= (w[4] >> 20); + p[7] = (w[4]) & ((1<<20)-1); + p[8] = (w[5] >> 12); + p[9] = (w[5] << 8) & ((1<<20)-1); + p[9] |= (w[6] >> 24); + p[10] = (w[6] >> 4) & ((1<<20)-1); + p[11] = (w[6] << 16) & ((1<<20)-1); + p[11] |= (w[7] >> 16); + p[12] = (w[7] << 4) & ((1<<20)-1); + p[12] |= (w[8] >> 28); + p[13] = (w[8] >> 8) & ((1<<20)-1); + p[14] = (w[8] << 12) & ((1<<20)-1); + p[14] |= (w[9] >> 20); + p[15] = (w[9]) & ((1<<20)-1); + p[16] = (w[10] >> 12); + p[17] = (w[10] << 8) & ((1<<20)-1); + p[17] |= (w[11] >> 24); + p[18] = (w[11] >> 4) & ((1<<20)-1); + p[19] = (w[11] << 16) & ((1<<20)-1); + p[19] |= (w[12] >> 16); + p[20] = (w[12] << 4) & ((1<<20)-1); + p[20] |= (w[13] >> 28); + p[21] = (w[13] >> 8) & ((1<<20)-1); + p[22] = (w[13] << 12) & ((1<<20)-1); + p[22] |= (w[14] >> 20); + p[23] = (w[14]) & ((1<<20)-1); + p[24] = (w[15] >> 12); + p[25] = (w[15] << 8) & ((1<<20)-1); + p[25] |= (w[16] >> 24); + p[26] = (w[16] >> 4) & ((1<<20)-1); + p[27] = (w[16] << 16) & ((1<<20)-1); + p[27] |= (w[17] >> 16); + p[28] = (w[17] << 4) & ((1<<20)-1); + p[28] |= (w[18] >> 28); + p[29] = (w[18] >> 8) & ((1<<20)-1); + p[30] = (w[18] << 12) & ((1<<20)-1); + p[30] |= (w[19] >> 20); + p[31] = (w[19]) & ((1<<20)-1); + } +} + + +void unpack32(unsigned 
int *p, unsigned int *w) +{ + int i; + + for (i = 0; i < BS; i += 32, p += 32, w += 32) + { + p[0] = w[0]; + p[1] = w[1]; + p[2] = w[2]; + p[3] = w[3]; + p[4] = w[4]; + p[5] = w[5]; + p[6] = w[6]; + p[7] = w[7]; + p[8] = w[8]; + p[9] = w[9]; + p[10] = w[10]; + p[11] = w[11]; + p[12] = w[12]; + p[13] = w[13]; + p[14] = w[14]; + p[15] = w[15]; + p[16] = w[16]; + p[17] = w[17]; + p[18] = w[18]; + p[19] = w[19]; + p[20] = w[20]; + p[21] = w[21]; + p[22] = w[22]; + p[23] = w[23]; + p[24] = w[24]; + p[25] = w[25]; + p[26] = w[26]; + p[27] = w[27]; + p[28] = w[28]; + p[29] = w[29]; + p[30] = w[30]; + p[31] = w[31]; + } +} + + +typedef void (*pf)(unsigned int *p, unsigned int *w); +pf unpack[17] = {unpack0, unpack1, unpack2, unpack3, unpack4, unpack5, + unpack6, unpack7, unpack8, unpack9, unpack10, unpack11, + unpack12, unpack13, unpack16, unpack20, unpack32}; + diff --git a/aux/simdcomp/bitpacka.c b/aux/simdcomp/bitpacka.c new file mode 100644 index 0000000..d23507d --- /dev/null +++ b/aux/simdcomp/bitpacka.c @@ -0,0 +1,17773 @@ +#include "bitpacka.h" +#define INLINE inline +uint32_t * nullpacker(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + return out; +} + + const uint32_t * nullunpacker8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,8 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 1 ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 3 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 5 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 7 ; + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in++) ; + *out |= ( (*in++) ) << 2 ; + *out |= ( (*in++) ) << 4 ; + *out |= ( (*in++) ) << 6 ; + *out |= ( (*in++) ) << 8 ; + *out |= ( (*in++) ) << 10 ; + *out |= ( (*in++) ) << 12 ; + *out |= ( (*in++) ) << 14 ; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + 
++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( 
(*in) ) >> ( 13 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; 
+ *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_8(const uint32_t * 
__restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( 
(*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + +#if 0 +#define OUTI(__x) *out++ +#define OUT(__x) *out +#define OUI out++ +#else +#define OUTI(__x) out[__x] +#define OUT(__x) out[__x] +#define OUI +#endif +const INLINE uint32_t * __fastunpack1_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) & 1; + OUTI( 1) = ( (*in) >> 1 ) & 1; + OUTI( 2) = ( (*in) >> 2 ) & 1; + OUTI( 3) = ( (*in) >> 3 ) & 1; + OUTI( 4) = ( (*in) >> 4 ) & 1; + OUTI( 5) = ( (*in) >> 5 ) & 1; + OUTI( 6) = ( (*in) >> 6 ) & 1; + OUTI( 7) = ( (*in) >> 7 ) & 1; + return in + 1; +} + +const INLINE uint32_t * __fastunpack2_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + OUTI( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + OUTI( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + OUTI( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + OUTI( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + OUTI( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + OUTI( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack3_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + OUTI( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + OUTI( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + OUTI( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + OUTI( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + OUTI( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + OUTI( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + OUTI( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack4_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + OUTI( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + OUTI( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + OUTI( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + OUTI( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + OUTI( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + OUTI( 7) = ( (*in++) >> 28 ) ; + return in; +} + +const uint32_t * __fastunpack5_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + OUTI( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + OUTI( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + OUTI( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + OUTI( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + OUTI( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + OUT( 6) = ( (*in++) >> 30 ) ; + OUT( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + OUI; + OUTI( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack6_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) 
{ + OUTI( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + OUTI( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + OUTI( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + OUTI( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + OUTI( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + OUT( 5) = ( (*in++) >> 30 ) ; + OUT( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + OUI; + OUTI( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + OUTI( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack7_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + OUTI( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + OUTI( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + OUTI( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + OUT( 4) = ( (*in++) >> 28 ) ; + OUT( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + OUI; + OUTI( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + OUTI( 6 ) = ( (*in) >> 10 ) % (1U << 7 ) ; + OUTI( 7 ) = ( (*in) >> 17 ) % (1U << 7 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack8_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 3) = ( (*in++) >> 24 ) ; + OUTI( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + OUTI( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + OUTI( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + OUTI( 7) = ( (*in++) >> 24 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack9_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + OUTI( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + OUTI( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + OUT( 3) = ( (*in++) >> 27 ) ; + OUT( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + OUI; + OUTI( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + OUTI( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + OUTI( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + OUT( 7) = ( (*in++) >> 31 ) ; + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack10_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + OUTI( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + OUTI( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + OUT( 3) = ( (*in++) >> 30 ) ; + OUT( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + OUI; + OUTI( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + OUTI( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + OUT( 6) = ( (*in++) >> 28 ) ; + OUT( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + OUI; + OUTI( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack11_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ((*in) >> 0 ) % (1U << 11 ) ; + OUTI( 1) = ((*in) >> 11 ) % (1U << 11 ) ; + OUT( 2) = ((*in++) >> 22 ) ; + OUT( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + OUI; + OUTI( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + OUTI( 4) = ((*in) >> 12 ) % (1U << 11 ) ; + OUT( 5) = (*in++) >> 23; + OUT( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + OUI; + OUTI( 6) = ((*in) >> 2 ) % (1U << 11 ) ; + OUTI( 7) = ((*in) >> 13 ) % (1U << 11 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack12_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + OUTI( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + OUT( 2) = ( (*in++) >> 24 ) ; + OUT( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + OUI; + OUTI( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + OUTI( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + OUT( 5) = ( (*in++) >> 28 ) ; + OUT( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + OUI; + OUTI( 6) = ( (*in) >> 8 ) % (1U << 12 
) ; + OUTI( 7) = ( (*in++) >> 20 ) ; + return in; +} + +const INLINE uint32_t * __fastunpack13_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + OUTI( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + OUTI( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + OUT( 2) = ( (*in++) >> 26 ) ; + OUT( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + OUI; + OUTI( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + OUT( 4) = ( (*in++) >> 20 ) ; + OUT( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + OUI; + OUTI( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + OUTI( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + OUT( 7) = ( (*in++) >> 27 ); + OUT( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + OUI; + return in + 1; +} + +const INLINE uint32_t * __fastunpack14_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + *out++ = ( (*in) >> 0 ) % (1U << 14 ) ; + *out++ = ( (*in) >> 14 ) % (1U << 14 ) ; + *out = ( (*in++) >> 28 ) ; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out++ = ( (*in) >> 10 ) % (1U << 14 ) ; + *out = ( (*in++) >> 24 ) ; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out++ = ( (*in) >> 6 ) % (1U << 14 ) ; + *out = ( (*in++) >> 20 ) ; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out++ = ( (*in) >> 2 ) % (1U << 14 ) ; + return in + 1; +} + +const INLINE uint32_t * __fastunpack15_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack16_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack17_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack18_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + 
++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack19_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack20_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const INLINE uint32_t * __fastunpack21_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + + return in + 1; + } + + + + +const INLINE uint32_t * __fastunpack22_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack23_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; 
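+ /* For unsigned operands, x % (1U << b) is identical to
+    x & ((1U << b) - 1): both keep the low b bits, and compilers lower
+    the power-of-two modulo to the AND, so the spelling has no cost. */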
+ *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker8(in,out); + + case 1: + return __fastunpack1_8(in,out); + + case 2: + return __fastunpack2_8(in,out); + + case 3: + return 
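+ /* Each __fastunpackB_8 decodes the next 8 integers and returns the
+    input pointer advanced past the words it consumed, so a caller can
+    decode n = 8*k values as (illustrative sketch; loop and variable
+    names are not from this file):
+      for (size_t i = 0; i < n; i += 8) in = fastunpack_8(in, out + i, b);
+ */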
__fastunpack3_8(in,out); + + case 4: + return __fastunpack4_8(in,out); + + case 5: + return __fastunpack5_8(in,out); + + case 6: + return __fastunpack6_8(in,out); + + case 7: + return __fastunpack7_8(in,out); + + case 8: + return __fastunpack8_8(in,out); + + case 9: + return __fastunpack9_8(in,out); + + case 10: + return __fastunpack10_8(in,out); + + case 11: + return __fastunpack11_8(in,out); + + case 12: + return __fastunpack12_8(in,out); + + case 13: + return __fastunpack13_8(in,out); + + case 14: + return __fastunpack14_8(in,out); + + case 15: + return __fastunpack15_8(in,out); + + case 16: + return __fastunpack16_8(in,out); + + case 17: + return __fastunpack17_8(in,out); + + case 18: + return __fastunpack18_8(in,out); + + case 19: + return __fastunpack19_8(in,out); + + case 20: + return __fastunpack20_8(in,out); + + case 21: + return __fastunpack21_8(in,out); + + case 22: + return __fastunpack22_8(in,out); + + case 23: + return __fastunpack23_8(in,out); + + case 24: + return __fastunpack24_8(in,out); + + case 25: + return __fastunpack25_8(in,out); + + case 26: + return __fastunpack26_8(in,out); + + case 27: + return __fastunpack27_8(in,out); + + case 28: + return __fastunpack28_8(in,out); + + case 29: + return __fastunpack29_8(in,out); + + case 30: + return __fastunpack30_8(in,out); + + case 31: + return __fastunpack31_8(in,out); + + case 32: + return __fastunpack32_8(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_8(in,out); + + case 2: + return __fastpackwithoutmask2_8(in,out); + + case 3: + return __fastpackwithoutmask3_8(in,out); + + case 4: + return __fastpackwithoutmask4_8(in,out); + + case 5: + return __fastpackwithoutmask5_8(in,out); + + case 6: + return __fastpackwithoutmask6_8(in,out); + + case 7: + return __fastpackwithoutmask7_8(in,out); + + case 8: + return __fastpackwithoutmask8_8(in,out); + + case 9: + return __fastpackwithoutmask9_8(in,out); + + case 10: + return __fastpackwithoutmask10_8(in,out); + + case 11: + return __fastpackwithoutmask11_8(in,out); + + case 12: + return __fastpackwithoutmask12_8(in,out); + + case 13: + return __fastpackwithoutmask13_8(in,out); + + case 14: + return __fastpackwithoutmask14_8(in,out); + + case 15: + return __fastpackwithoutmask15_8(in,out); + + case 16: + return __fastpackwithoutmask16_8(in,out); + + case 17: + return __fastpackwithoutmask17_8(in,out); + + case 18: + return __fastpackwithoutmask18_8(in,out); + + case 19: + return __fastpackwithoutmask19_8(in,out); + + case 20: + return __fastpackwithoutmask20_8(in,out); + + case 21: + return __fastpackwithoutmask21_8(in,out); + + case 22: + return __fastpackwithoutmask22_8(in,out); + + case 23: + return __fastpackwithoutmask23_8(in,out); + + case 24: + return __fastpackwithoutmask24_8(in,out); + + case 25: + return __fastpackwithoutmask25_8(in,out); + + case 26: + return __fastpackwithoutmask26_8(in,out); + + case 27: + return __fastpackwithoutmask27_8(in,out); + + case 28: + return __fastpackwithoutmask28_8(in,out); + + case 29: + return __fastpackwithoutmask29_8(in,out); + + case 30: + return __fastpackwithoutmask30_8(in,out); + + case 31: + return __fastpackwithoutmask31_8(in,out); + + case 32: + return __fastpackwithoutmask32_8(in,out); + + default: 
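+ /* With the throw below commented out, this dispatcher (and fastunpack_8
+    above) falls off the end of a non-void function when bit > 32, which
+    is undefined behavior in C++; callers must keep bit in 0..32, or the
+    throw should be restored. */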
+ break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,16 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out 
|= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + 
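+ /* As noted above, the "without mask" packers assume every input already
+    fits in the target width: values are OR-ed in unmasked, so stray high
+    bits would corrupt neighbouring fields. An unsure caller can pre-mask
+    first (sketch; names are illustrative, not from this file):
+      for (int i = 0; i < 16; i++) tmp[i] = v[i] & ((1U << b) - 1);
+ */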
+ *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_16(const 
uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out 
|= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask20_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( 
(*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + 
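+ /* Packing mirrors the unpack straddle: a boundary-crossing value's low
+    32 - pos bits are OR-ed into the current word with << pos, and the
+    next word then starts with the remaining r bits, written as
+    value >> (b - r) -- hence spill lines of the form
+    (*in) >> ( 24 - 16 ) in this function. */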
*out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( 
(*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + 
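+ /* Each 16-element packer emits exactly ceil(16*b/32) words: b/2 full
+    words for even b (this b = 30 case writes 15 and returns out after
+    its final ++out), and (b+1)/2 words for odd b, where the last word
+    is only partially filled and the function returns out + 1. */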
*out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask31_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + + + + +const uint32_t * __fastunpack1_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + 
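+ /* Width 1 is spelled with & 1 rather than % (1U << 1); the two masks
+    are equivalent. All sixteen 1-bit fields fit in a single word, so
+    this function consumes exactly one word (return in + 1). */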
out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack2_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack3_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack4_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack5_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) 
>> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack6_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack7_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 7 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack8_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack9_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ 
out) { + + *out = ( (*in) >> 0 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 9 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack10_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack11_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 11 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack12_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 
28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack13_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 13 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack14_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack15_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 15 ) ; + 
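+ /* A pack/unpack round trip over pre-masked values is a cheap sanity
+    check for any width b (sketch; assumes _16 dispatchers analogous to
+    the fastunpack_8 / fastpackwithoutmask_8 switches above):
+      uint32_t src[16], packed[16], back[16];
+      for (int i = 0; i < 16; i++)
+          src[i] = rand() & ((b < 32) ? (1U << b) - 1 : ~0U);
+      fastpackwithoutmask_16(src, packed, b);
+      fastunpack_16(packed, back, b);
+      assert(memcmp(src, back, sizeof src) == 0);
+ */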
out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 15 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack16_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack17_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack18_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); 
+ out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack19_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack20_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( 
(*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack22_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack23_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % 
(1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + 
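+ /* Input-size convention: 16 values of B bits occupy 16*B/32 = B/2 words.
+    For even B the input pointer advances exactly that far and the routine
+    returns in; for odd B the final word is only half consumed, so the
+    routine returns in + 1. */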
*out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack27_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + 
*out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out 
|= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker16(in,out); + + case 1: + return __fastunpack1_16(in,out); + + case 2: + return __fastunpack2_16(in,out); + + case 3: + return __fastunpack3_16(in,out); + + case 4: + return __fastunpack4_16(in,out); + + case 5: + return __fastunpack5_16(in,out); + + case 6: + return __fastunpack6_16(in,out); + + case 7: + return __fastunpack7_16(in,out); + + case 8: + return __fastunpack8_16(in,out); + + case 9: + return __fastunpack9_16(in,out); + + case 10: + return __fastunpack10_16(in,out); + + case 11: + return __fastunpack11_16(in,out); + + case 12: + return __fastunpack12_16(in,out); + + case 13: + return __fastunpack13_16(in,out); + + case 14: + return __fastunpack14_16(in,out); + + case 15: + return __fastunpack15_16(in,out); + + case 16: + return __fastunpack16_16(in,out); + + case 17: + return __fastunpack17_16(in,out); + + case 18: + return __fastunpack18_16(in,out); + + case 19: + return __fastunpack19_16(in,out); + + case 20: + return __fastunpack20_16(in,out); + + case 21: + return __fastunpack21_16(in,out); + + case 22: + return __fastunpack22_16(in,out); + + case 23: + return __fastunpack23_16(in,out); + + case 24: + return __fastunpack24_16(in,out); + + case 25: + return __fastunpack25_16(in,out); + + case 26: + return __fastunpack26_16(in,out); + + case 27: + return __fastunpack27_16(in,out); + + case 28: + return __fastunpack28_16(in,out); + + case 29: + return __fastunpack29_16(in,out); + + case 30: + return __fastunpack30_16(in,out); + + case 31: + return __fastunpack31_16(in,out); + + case 32: + return __fastunpack32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_16(in,out); + + case 2: + return __fastpackwithoutmask2_16(in,out); + + case 3: + return __fastpackwithoutmask3_16(in,out); + + case 4: + return __fastpackwithoutmask4_16(in,out); + + case 5: + return __fastpackwithoutmask5_16(in,out); + + case 6: + return __fastpackwithoutmask6_16(in,out); + + case 7: + return __fastpackwithoutmask7_16(in,out); + + case 8: + return 
__fastpackwithoutmask8_16(in,out); + + case 9: + return __fastpackwithoutmask9_16(in,out); + + case 10: + return __fastpackwithoutmask10_16(in,out); + + case 11: + return __fastpackwithoutmask11_16(in,out); + + case 12: + return __fastpackwithoutmask12_16(in,out); + + case 13: + return __fastpackwithoutmask13_16(in,out); + + case 14: + return __fastpackwithoutmask14_16(in,out); + + case 15: + return __fastpackwithoutmask15_16(in,out); + + case 16: + return __fastpackwithoutmask16_16(in,out); + + case 17: + return __fastpackwithoutmask17_16(in,out); + + case 18: + return __fastpackwithoutmask18_16(in,out); + + case 19: + return __fastpackwithoutmask19_16(in,out); + + case 20: + return __fastpackwithoutmask20_16(in,out); + + case 21: + return __fastpackwithoutmask21_16(in,out); + + case 22: + return __fastpackwithoutmask22_16(in,out); + + case 23: + return __fastpackwithoutmask23_16(in,out); + + case 24: + return __fastpackwithoutmask24_16(in,out); + + case 25: + return __fastpackwithoutmask25_16(in,out); + + case 26: + return __fastpackwithoutmask26_16(in,out); + + case 27: + return __fastpackwithoutmask27_16(in,out); + + case 28: + return __fastpackwithoutmask28_16(in,out); + + case 29: + return __fastpackwithoutmask29_16(in,out); + + case 30: + return __fastpackwithoutmask30_16(in,out); + + case 31: + return __fastpackwithoutmask31_16(in,out); + + case 32: + return __fastpackwithoutmask32_16(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + const uint32_t * nullunpacker24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + memset(out,0,24 * 4); + return in; + } + + + uint32_t * __fastpackwithoutmask1_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask2_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + 
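+ /* The *_24 routines in the remainder of this header repeat the same
+    packing/unpacking scheme for groups of 24 integers instead of 16. */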
*out |= ( (*in) ) << 14 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask3_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask4_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask6_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> 
( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask7_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask8_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + 
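+ /* Encoding mirror of the straddle case: the low B - k bits of the current
+    value were just written at the top of the previous word, and
+    *out = ((*in)) >> (B - k) seeds the new word with the value's k high
+    bits; the following value is then OR-ed in starting at bit k. */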
++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask10_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask11_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask12_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out 
= ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask14_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + 
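+ /* Output-size convention for the _24 packers: 24 B-bit values occupy
+    24*B/32 = 3*B/4 words, so bit widths divisible by 4 fill whole words
+    and return out, while all other widths return out + 1 to account for a
+    final, partially filled word. */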
++in; + *out |= ( (*in) ) << 2 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask15_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask16_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + 
++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask18_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask19_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + + return out + 1; + } + + + + 
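+ /* Illustrative sketch (not part of the original OPT-PFD sources): each
+    __fastpackwithoutmaskB_24 packer is the inverse of the corresponding
+    __fastunpackB_24 decoder, assumed to be defined alongside nullunpacker24
+    elsewhere in this header, for inputs that already fit in B bits (the
+    "withoutmask" packers do not mask their inputs). A hypothetical
+    round-trip check for B = 19 could look like:
+
+        uint32_t src[24], packed[24], dst[24];
+        for (int i = 0; i < 24; i++) src[i] = (uint32_t)(i * 7);  // all fit in 19 bits
+        __fastpackwithoutmask19_24(src, packed);
+        __fastunpack19_24(packed, dst);       // assumed counterpart decoder
+        assert(memcmp(src, dst, sizeof src) == 0);  // needs <assert.h>, <string.h>
+ */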
uint32_t * __fastpackwithoutmask20_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask22_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out 
|= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask23_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask24_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) 
) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask26_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) 
) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask27_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask28_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 
- 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask30_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( 
(*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask31_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + + return out + 1; + } + + + + uint32_t * __fastpackwithoutmask32_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = 
(*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + } + + + + +const uint32_t * __fastunpack1_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) & 1 ; + out++; + *out = ( (*in) >> 1 ) & 1 ; + out++; + *out = ( (*in) >> 2 ) & 1 ; + out++; + *out = ( (*in) >> 3 ) & 1 ; + out++; + *out = ( (*in) >> 4 ) & 1 ; + out++; + *out = ( (*in) >> 5 ) & 1 ; + out++; + *out = ( (*in) >> 6 ) & 1 ; + out++; + *out = ( (*in) >> 7 ) & 1 ; + out++; + *out = ( (*in) >> 8 ) & 1 ; + out++; + *out = ( (*in) >> 9 ) & 1 ; + out++; + *out = ( (*in) >> 10 ) & 1 ; + out++; + *out = ( (*in) >> 11 ) & 1 ; + out++; + *out = ( (*in) >> 12 ) & 1 ; + out++; + *out = ( (*in) >> 13 ) & 1 ; + out++; + *out = ( (*in) >> 14 ) & 1 ; + out++; + *out = ( (*in) >> 15 ) & 1 ; + out++; + *out = ( (*in) >> 16 ) & 1 ; + out++; + *out = ( (*in) >> 17 ) & 1 ; + out++; + *out = ( (*in) >> 18 ) & 1 ; + out++; + *out = ( (*in) >> 19 ) & 1 ; + out++; + *out = ( (*in) >> 20 ) & 1 ; + out++; + *out = ( (*in) >> 21 ) & 1 ; + out++; + *out = ( (*in) >> 22 ) & 1 ; + out++; + *out = ( (*in) >> 23 ) & 1 ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack2_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 2 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 2 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 2 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack3_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 3 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 27 ) % (1U << 3 ) ; + 
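/* 10 values * 3 bits = 30 bits consumed; the 11th value straddles the word boundary: 2 low bits from bits 30-31 of this word, 1 high bit OR'ed in from the next word */ +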
out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 28 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 3 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 3 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack4_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 4 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 4 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack5_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 5 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 25 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 26 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 5 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 5 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack6_24(const uint32_t * __restrict__ in, 
uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 6 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 6 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 6 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack7_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 7 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 24 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 23 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 19 ) % (1U << 7 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 7 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack8_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( 
(*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 8 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 8 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack9_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 9 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 22 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 21 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 9 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 9 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack10_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 10 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 20 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 10 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 
10 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack11_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 11 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 18 ) % (1U << 11 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack12_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 12 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 12 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack13_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 13 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 20 
) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 17 ) % (1U << 13 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 13 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack14_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 14 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 14 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 14 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack15_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 15 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + out++; + *out = ( (*in) >> 13 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + out++; + *out = ( (*in) >> 9 ) % 
(1U << 15 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 16 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 15 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack16_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 16 ) ; + out++; + *out = ( (*in) >> 16 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack17_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + out++; + *out = ( (*in) >> 14 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out 
|= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 17 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 17 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 17 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack18_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack19_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + 
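/* 19-bit value starting at bit 29: 3 low bits from this word, the remaining 16 bits from the next word, shifted into place by << (19 - 16) */ +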
*out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack20_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + 
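/* complete the straddled 21-bit value: its 15 low bits came from bits 17-31 of the previous word, the remaining 6 bits come from this one */ +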
*out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack22_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack23_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 
); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack24_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack25_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) 
% (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack26_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack27_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 
22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack28_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; 
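+ /* 28-bit value starting at bit 16: 16 bits from this word, 12 more from the next word at << 16 */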
+ ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack30_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 
8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack31_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + + return in + 1; + } + + + + +const uint32_t * __fastunpack32_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + 
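+ /* 32-bit case: each value occupies exactly one input word, so unpacking the 24-integer block degenerates into 24 plain word copies -- no shifts or masks are needed and the loop body is equivalent to memcpy(out, in, 24 * 4). */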
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+ *out = ( (*in) >> 0 ) ;
+ ++in;
+ out++;
+
+ return in;
+ }
+
+
+
+ const uint32_t * fastunpack_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullunpacker24(in,out);
+
+ case 1:
+ return __fastunpack1_24(in,out);
+
+ case 2:
+ return __fastunpack2_24(in,out);
+
+ case 3:
+ return __fastunpack3_24(in,out);
+
+ case 4:
+ return __fastunpack4_24(in,out);
+
+ case 5:
+ return __fastunpack5_24(in,out);
+
+ case 6:
+ return __fastunpack6_24(in,out);
+
+ case 7:
+ return __fastunpack7_24(in,out);
+
+ case 8:
+ return __fastunpack8_24(in,out);
+
+ case 9:
+ return __fastunpack9_24(in,out);
+
+ case 10:
+ return __fastunpack10_24(in,out);
+
+ case 11:
+ return __fastunpack11_24(in,out);
+
+ case 12:
+ return __fastunpack12_24(in,out);
+
+ case 13:
+ return __fastunpack13_24(in,out);
+
+ case 14:
+ return __fastunpack14_24(in,out);
+
+ case 15:
+ return __fastunpack15_24(in,out);
+
+ case 16:
+ return __fastunpack16_24(in,out);
+
+ case 17:
+ return __fastunpack17_24(in,out);
+
+ case 18:
+ return __fastunpack18_24(in,out);
+
+ case 19:
+ return __fastunpack19_24(in,out);
+
+ case 20:
+ return __fastunpack20_24(in,out);
+
+ case 21:
+ return __fastunpack21_24(in,out);
+
+ case 22:
+ return __fastunpack22_24(in,out);
+
+ case 23:
+ return __fastunpack23_24(in,out);
+
+ case 24:
+ return __fastunpack24_24(in,out);
+
+ case 25:
+ return __fastunpack25_24(in,out);
+
+ case 26:
+ return __fastunpack26_24(in,out);
+
+ case 27:
+ return __fastunpack27_24(in,out);
+
+ case 28:
+ return __fastunpack28_24(in,out);
+
+ case 29:
+ return __fastunpack29_24(in,out);
+
+ case 30:
+ return __fastunpack30_24(in,out);
+
+ case 31:
+ return __fastunpack31_24(in,out);
+
+ case 32:
+ return __fastunpack32_24(in,out);
+
+ default:
+ break;
+ }
+ //throw logic_error("number of bits is unsupported");
+ /* bit is always in [0,32] here, but falling off the end of a non-void function is undefined behaviour, so return the input unchanged */
+ return in;
+ }
+
+
+
+ /*assumes that integers fit in the prescribed number of bits*/
+ uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) {
+ switch(bit) {
+ case 0: return nullpacker(in,out);
+
+ case 1:
+ return __fastpackwithoutmask1_24(in,out);
+
+ case 2:
+ return __fastpackwithoutmask2_24(in,out);
+
+ case 3:
+ return __fastpackwithoutmask3_24(in,out);
+
+ case 4:
+ return __fastpackwithoutmask4_24(in,out);
+
+ case 5:
+ return __fastpackwithoutmask5_24(in,out);
+
+ case 6:
+ return __fastpackwithoutmask6_24(in,out);
+
+ case 7:
+ return __fastpackwithoutmask7_24(in,out);
+
+ case 8:
+ return __fastpackwithoutmask8_24(in,out);
+
+ case 9:
+ return __fastpackwithoutmask9_24(in,out);
+
+ case 10:
+ return __fastpackwithoutmask10_24(in,out);
+
+ case 11:
+ return __fastpackwithoutmask11_24(in,out);
+
+ case 12:
+ return __fastpackwithoutmask12_24(in,out);
+
+ case 13:
+ return __fastpackwithoutmask13_24(in,out);
+
+ case 14:
+ return __fastpackwithoutmask14_24(in,out);
+
+ case 15:
+ return __fastpackwithoutmask15_24(in,out);
+
+ case 16:
+ return __fastpackwithoutmask16_24(in,out);
+
+ case 17:
+ return __fastpackwithoutmask17_24(in,out);
+
+ case 18:
+ return __fastpackwithoutmask18_24(in,out);
+
+ case 19:
+ return __fastpackwithoutmask19_24(in,out);
+
+ case 20:
+ return __fastpackwithoutmask20_24(in,out);
+
+ case 21:
+ return __fastpackwithoutmask21_24(in,out);
+
+ case 22:
+ return __fastpackwithoutmask22_24(in,out);
+
+ case 23:
+ return __fastpackwithoutmask23_24(in,out);
+
+ case 24:
+ return __fastpackwithoutmask24_24(in,out);
+
+ case 25:
+ return __fastpackwithoutmask25_24(in,out);
+
+ case 26:
+ return __fastpackwithoutmask26_24(in,out);
+
+ case 27:
+ return __fastpackwithoutmask27_24(in,out);
+
+ case 28:
+ return __fastpackwithoutmask28_24(in,out);
+
+ case 29:
+ return __fastpackwithoutmask29_24(in,out);
+
+ case 30:
+ return __fastpackwithoutmask30_24(in,out);
+
+ case 31:
+ return __fastpackwithoutmask31_24(in,out);
+
+ case 32:
+ return __fastpackwithoutmask32_24(in,out);
+
+ default:
+ break;
+ }
+ //throw logic_error("number of bits is unsupported");
+ /* mirrors fastunpack_24: an explicit return avoids undefined behaviour on an unsupported bit width */
+ return out;
+ }
+
+
+ const uint32_t * nullunpacker32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+ memset(out,0,32 * 4);
+ return in;
+ }
+
+
+ uint32_t * __fastpackwithoutmask1_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 1 ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 3 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 5 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 7 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 9 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 11 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 13 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 15 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 17 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 19 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 21 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 23 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 25 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 27 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 29 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++in;
+ *out |= ( (*in) ) << 31 ;
+ ++out;
+ ++in;
+
+ return out;
+ }
+
+
+
+ uint32_t * __fastpackwithoutmask2_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) {
+
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= ( (*in) ) << 10 ;
+ ++in;
+ *out |= ( (*in) ) << 12 ;
+ ++in;
+ *out |= ( (*in) ) << 14 ;
+ ++in;
+ *out |= ( (*in) ) << 16 ;
+ ++in;
+ *out |= ( (*in) ) << 18 ;
+ ++in;
+ *out |= ( (*in) ) << 20 ;
+ ++in;
+ *out |= ( (*in) ) << 22 ;
+ ++in;
+ *out |= ( (*in) ) << 24 ;
+ ++in;
+ *out |= ( (*in) ) << 26 ;
+ ++in;
+ *out |= ( (*in) ) << 28 ;
+ ++in;
+ *out |= ( (*in) ) << 30 ;
+ ++out;
+ ++in;
+ *out = (*in) ;
+ ++in;
+ *out |= ( (*in) ) << 2 ;
+ ++in;
+ *out |= ( (*in) ) << 4 ;
+ ++in;
+ *out |= ( (*in) ) << 6 ;
+ ++in;
+ *out |= ( (*in) ) << 8 ;
+ ++in;
+ *out |= 
( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask3_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 3 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 3 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask4_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask5_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 5 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( 
(*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 5 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 5 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 5 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask6_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 6 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 6 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask7_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 7 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 7 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 7 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 7 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 7 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) 
) << 8 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 7 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask8_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask9_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 9 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 9 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 9 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 9 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 9 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 9 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 9 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 9 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask10_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 
18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 10 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 10 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 10 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 10 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask11_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 11 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 11 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 11 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 11 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 11 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 11 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 11 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 11 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 11 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 11 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask12_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + 
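+ /* the third 12-bit value starts at bit 24 and straddles the word boundary: only its low 8 bits fit in this word; the ">> ( 12 - 4 )" statement that follows carries its remaining high 4 bits into the next output word */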
++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 12 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 12 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask13_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 13 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 13 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 13 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 13 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 13 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 13 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 13 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 13 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 13 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 13 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 13 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 13 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask14_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( 
(*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 14 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 14 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 14 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 14 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 14 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 14 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask15_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 15 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 15 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 15 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 15 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 15 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 15 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 15 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 15 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 16 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 15 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 15 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 15 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 15 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 15 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 15 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 15 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask16_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + 
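+ /* 16 bits pack exactly two values per 32-bit word, so no value ever straddles a word boundary and no ">> ( 16 - k )" carry terms are needed */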
++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask17_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 17 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 17 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 17 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 17 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 17 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 17 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 17 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 17 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 17 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 17 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 17 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 17 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 17 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 17 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 17 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 17 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask18_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 
28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 18 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 18 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 18 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 18 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 18 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 18 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 18 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 18 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask19_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 19 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 19 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 19 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 19 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 19 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 19 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 19 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 19 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 19 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 19 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 19 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 19 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 19 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 19 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 19 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 19 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 19 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 19 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask20_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( 
(*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 20 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 20 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 20 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 20 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask21_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 21 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 21 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 21 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 21 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 21 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 21 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 21 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 21 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 21 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 21 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 21 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 21 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 21 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 21 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 21 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 21 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 21 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 21 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 21 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( 
(*in) ) >> ( 21 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask22_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 22 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 22 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 22 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 22 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 22 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 22 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 22 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 22 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 22 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 22 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask23_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 23 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 23 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 23 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 23 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 23 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 23 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 23 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 23 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 23 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 23 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 23 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 23 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out 
= ( (*in) ) >> ( 23 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 23 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 23 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 23 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 23 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 23 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 23 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 23 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 23 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 23 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask24_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 24 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 24 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask25_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 25 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 25 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 25 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 25 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 25 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 25 - 8 ); + ++in; + *out |= ( (*in) ) << 8 
; + ++out; + *out = ( (*in) ) >> ( 25 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 25 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 25 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 25 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 25 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 25 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 25 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 25 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 25 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 25 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 25 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 25 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 25 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 25 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 25 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 25 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 25 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 25 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask26_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 26 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 26 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 26 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 26 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 26 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 26 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 26 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 26 - 4 ); + ++in; + *out 
|= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 26 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 26 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 26 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 26 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask27_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 27 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 27 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 27 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 27 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 27 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 27 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 27 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 27 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 27 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 27 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 27 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 27 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 27 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 27 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 27 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 27 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 27 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 27 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 27 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 27 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 27 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 27 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 27 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 27 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 27 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 27 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask28_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( 
(*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 28 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 28 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 28 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 28 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 28 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 28 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask29_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 29 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 29 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 29 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 29 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 29 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 29 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 29 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 29 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 29 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 29 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 29 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 29 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 29 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 29 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 29 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 29 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 29 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 29 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 29 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 29 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 29 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 29 - 21 ); + ++in; + *out |= ( 
(*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 29 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 29 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 29 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 29 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 29 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 29 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask30_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 30 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 30 - 26 ); + ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 30 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 30 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 30 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 30 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 30 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 30 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 30 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 30 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 30 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 30 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 30 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 30 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + ++in; + + return out; + } + + + + uint32_t * __fastpackwithoutmask31_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++in; + *out |= ( (*in) ) << 31 ; + ++out; + *out = ( (*in) ) >> ( 31 - 30 ); + ++in; + *out |= ( (*in) ) << 30 ; + ++out; + *out = ( (*in) ) >> ( 31 - 29 ); + ++in; + *out |= ( (*in) ) << 29 ; + ++out; + *out = ( (*in) ) >> ( 31 - 28 ); + ++in; + *out |= ( (*in) ) << 28 ; + ++out; + *out = ( (*in) ) >> ( 31 - 27 ); + ++in; + *out |= ( (*in) ) << 27 ; + ++out; + *out = ( (*in) ) >> ( 31 - 26 ); 
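+ /* note on the 31-bit packing pattern: nearly every value straddles a word boundary, so its low bits top off the current output word and its remaining high bits spill into the bottom of the next */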
+ ++in; + *out |= ( (*in) ) << 26 ; + ++out; + *out = ( (*in) ) >> ( 31 - 25 ); + ++in; + *out |= ( (*in) ) << 25 ; + ++out; + *out = ( (*in) ) >> ( 31 - 24 ); + ++in; + *out |= ( (*in) ) << 24 ; + ++out; + *out = ( (*in) ) >> ( 31 - 23 ); + ++in; + *out |= ( (*in) ) << 23 ; + ++out; + *out = ( (*in) ) >> ( 31 - 22 ); + ++in; + *out |= ( (*in) ) << 22 ; + ++out; + *out = ( (*in) ) >> ( 31 - 21 ); + ++in; + *out |= ( (*in) ) << 21 ; + ++out; + *out = ( (*in) ) >> ( 31 - 20 ); + ++in; + *out |= ( (*in) ) << 20 ; + ++out; + *out = ( (*in) ) >> ( 31 - 19 ); + ++in; + *out |= ( (*in) ) << 19 ; + ++out; + *out = ( (*in) ) >> ( 31 - 18 ); + ++in; + *out |= ( (*in) ) << 18 ; + ++out; + *out = ( (*in) ) >> ( 31 - 17 ); + ++in; + *out |= ( (*in) ) << 17 ; + ++out; + *out = ( (*in) ) >> ( 31 - 16 ); + ++in; + *out |= ( (*in) ) << 16 ; + ++out; + *out = ( (*in) ) >> ( 31 - 15 ); + ++in; + *out |= ( (*in) ) << 15 ; + ++out; + *out = ( (*in) ) >> ( 31 - 14 ); + ++in; + *out |= ( (*in) ) << 14 ; + ++out; + *out = ( (*in) ) >> ( 31 - 13 ); + ++in; + *out |= ( (*in) ) << 13 ; + ++out; + *out = ( (*in) ) >> ( 31 - 12 ); + ++in; + *out |= ( (*in) ) << 12 ; + ++out; + *out = ( (*in) ) >> ( 31 - 11 ); + ++in; + *out |= ( (*in) ) << 11 ; + ++out; + *out = ( (*in) ) >> ( 31 - 10 ); + ++in; + *out |= ( (*in) ) << 10 ; + ++out; + *out = ( (*in) ) >> ( 31 - 9 ); + ++in; + *out |= ( (*in) ) << 9 ; + ++out; + *out = ( (*in) ) >> ( 31 - 8 ); + ++in; + *out |= ( (*in) ) << 8 ; + ++out; + *out = ( (*in) ) >> ( 31 - 7 ); + ++in; + *out |= ( (*in) ) << 7 ; + ++out; + *out = ( (*in) ) >> ( 31 - 6 ); + ++in; + *out |= ( (*in) ) << 6 ; + ++out; + *out = ( (*in) ) >> ( 31 - 5 ); + ++in; + *out |= ( (*in) ) << 5 ; + ++out; + *out = ( (*in) ) >> ( 31 - 4 ); + ++in; + *out |= ( (*in) ) << 4 ; + ++out; + *out = ( (*in) ) >> ( 31 - 3 ); + ++in; + *out |= ( (*in) ) << 3 ; + ++out; + *out = ( (*in) ) >> ( 31 - 2 ); + ++in; + *out |= ( (*in) ) << 2 ; + ++out; + *out = ( (*in) ) >> ( 31 - 1 ); + ++in; + *out |= ( (*in) ) << 1 ; + ++out; + ++in; + + return out; + }
+ + + + uint32_t * __fastpackwithoutmask32_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + *out = (*in) ; + ++out; + ++in; + + return out; + }
+
+/* DST/DSI choose how the unpack routines below store decoded values: indexed stores into out[i] (the branch compiled here), or plain pointer stores where DSI advances out once per decoded value. The pointer variant must not advance inside DST itself, otherwise the second |= step of a word-straddling value would land in the wrong slot. */
+#if 1
+#define DST(__x) out[__x]
+#define DSI
+#else
+#define DST(__x) (*out)
+#define DSI out++
+#endif
+
+const uint32_t * __fastunpack1_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) & 1 ; + DSI; + DST( 1) = ( (*in) >> 1 ) & 1 ; 
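+ /* 1-bit case: all 32 values are decoded from a single input word, one bit per value */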
+ DSI; + DST( 2) = ( (*in) >> 2 ) & 1 ; + DSI; + DST( 3) = ( (*in) >> 3 ) & 1 ; + DSI; + DST( 4) = ( (*in) >> 4 ) & 1 ; + DSI; + DST( 5) = ( (*in) >> 5 ) & 1 ; + DSI; + DST( 6) = ( (*in) >> 6 ) & 1 ; + DSI; + DST( 7) = ( (*in) >> 7 ) & 1 ; + DSI; + DST( 8) = ( (*in) >> 8 ) & 1 ; + DSI; + DST( 9) = ( (*in) >> 9 ) & 1 ; + DSI; + DST(10) = ( (*in) >> 10 ) & 1 ; + DSI; + DST(11) = ( (*in) >> 11 ) & 1 ; + DSI; + DST(12) = ( (*in) >> 12 ) & 1 ; + DSI; + DST(13) = ( (*in) >> 13 ) & 1 ; + DSI; + DST(14) = ( (*in) >> 14 ) & 1 ; + DSI; + DST(15) = ( (*in) >> 15 ) & 1 ; + DSI; + DST(16) = ( (*in) >> 16 ) & 1 ; + DSI; + DST(17) = ( (*in) >> 17 ) & 1 ; + DSI; + DST(18) = ( (*in) >> 18 ) & 1 ; + DSI; + DST(19) = ( (*in) >> 19 ) & 1 ; + DSI; + DST(20) = ( (*in) >> 20 ) & 1 ; + DSI; + DST(21) = ( (*in) >> 21 ) & 1 ; + DSI; + DST(22) = ( (*in) >> 22 ) & 1 ; + DSI; + DST(23) = ( (*in) >> 23 ) & 1 ; + DSI; + DST(24) = ( (*in) >> 24 ) & 1 ; + DSI; + DST(25) = ( (*in) >> 25 ) & 1 ; + DSI; + DST(26) = ( (*in) >> 26 ) & 1 ; + DSI; + DST(27) = ( (*in) >> 27 ) & 1 ; + DSI; + DST(28) = ( (*in) >> 28 ) & 1 ; + DSI; + DST(29) = ( (*in) >> 29 ) & 1 ; + DSI; + DST(30) = ( (*in) >> 30 ) & 1 ; + DSI; + DST(31) = ( (*in) >> 31 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack2_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST( 1) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST( 2) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + DST( 3) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST( 5) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST( 6) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST( 7) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST( 9) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(10) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(11) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(13) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(14) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(15) = ( (*in) >> 30 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 2 ) ; + DSI; + DST(17) = ( (*in) >> 2 ) % (1U << 2 ) ; + DSI; + DST(18) = ( (*in) >> 4 ) % (1U << 2 ) ; + DSI; + DST(19) = ( (*in) >> 6 ) % (1U << 2 ) ; + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 2 ) ; + DSI; + DST(21) = ( (*in) >> 10 ) % (1U << 2 ) ; + DSI; + DST(22) = ( (*in) >> 12 ) % (1U << 2 ) ; + DSI; + DST(23) = ( (*in) >> 14 ) % (1U << 2 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 2 ) ; + DSI; + DST(25) = ( (*in) >> 18 ) % (1U << 2 ) ; + DSI; + DST(26) = ( (*in) >> 20 ) % (1U << 2 ) ; + DSI; + DST(27) = ( (*in) >> 22 ) % (1U << 2 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) % (1U << 2 ) ; + DSI; + DST(29) = ( (*in) >> 26 ) % (1U << 2 ) ; + DSI; + DST(30) = ( (*in) >> 28 ) % (1U << 2 ) ; + DSI; + DST(31) = ( (*in) >> 30 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack3_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 3 ) ; + DSI; + DST( 1) = ( (*in) >> 3 ) % (1U << 3 ) ; + DSI; + DST( 2) = ( (*in) >> 6 ) % (1U << 3 ) ; + DSI; + DST( 3) = ( (*in) >> 9 ) % (1U << 3 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 3 ) ; + DSI; + DST( 5) = ( (*in) >> 15 ) % (1U << 3 ) ; + DSI; + DST( 6) = ( (*in) >> 18 ) % (1U << 3 ) ; + DSI; + DST( 7) = ( (*in) >> 21 ) % (1U << 3 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 3 ) ; + DSI; + DST( 9) = ( (*in) >> 27 ) % (1U << 
3 ) ; + DSI; + DST(10) = ( (*in) >> 30 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 1 ))<<( 3 - 1 ); + DSI; + DST(11) = ( (*in) >> 1 ) % (1U << 3 ) ; + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 3 ) ; + DSI; + DST(13) = ( (*in) >> 7 ) % (1U << 3 ) ; + DSI; + DST(14) = ( (*in) >> 10 ) % (1U << 3 ) ; + DSI; + DST(15) = ( (*in) >> 13 ) % (1U << 3 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 3 ) ; + DSI; + DST(17) = ( (*in) >> 19 ) % (1U << 3 ) ; + DSI; + DST(18) = ( (*in) >> 22 ) % (1U << 3 ) ; + DSI; + DST(19) = ( (*in) >> 25 ) % (1U << 3 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) % (1U << 3 ) ; + DSI; + DST(21) = ( (*in) >> 31 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 2 ))<<( 3 - 2 ); + DSI; + DST(22) = ( (*in) >> 2 ) % (1U << 3 ) ; + DSI; + DST(23) = ( (*in) >> 5 ) % (1U << 3 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 3 ) ; + DSI; + DST(25) = ( (*in) >> 11 ) % (1U << 3 ) ; + DSI; + DST(26) = ( (*in) >> 14 ) % (1U << 3 ) ; + DSI; + DST(27) = ( (*in) >> 17 ) % (1U << 3 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 3 ) ; + DSI; + DST(29) = ( (*in) >> 23 ) % (1U << 3 ) ; + DSI; + DST(30) = ( (*in) >> 26 ) % (1U << 3 ) ; + DSI; + DST(31) = ( (*in) >> 29 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack4_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 1) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST( 2) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST( 3) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST( 5) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST( 6) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST( 7) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST( 9) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(10) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(11) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(13) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(14) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(15) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(17) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(18) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(19) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(21) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(22) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(23) = ( (*in) >> 28 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 4 ) ; + DSI; + DST(25) = ( (*in) >> 4 ) % (1U << 4 ) ; + DSI; + DST(26) = ( (*in) >> 8 ) % (1U << 4 ) ; + DSI; + DST(27) = ( (*in) >> 12 ) % (1U << 4 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 4 ) ; + DSI; + DST(29) = ( (*in) >> 20 ) % (1U << 4 ) ; + DSI; + DST(30) = ( (*in) >> 24 ) % (1U << 4 ) ; + DSI; + DST(31) = ( (*in) >> 28 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack5_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 5 ) ; + DSI; + DST( 1) = ( (*in) >> 5 ) % (1U << 5 ) ; + DSI; + DST( 2) = ( (*in) >> 10 ) % (1U << 5 ) ; + DSI; + DST( 3) = ( (*in) >> 15 ) % (1U << 5 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) % (1U << 5 ) ; + DSI; + DST( 5) = ( (*in) >> 25 ) % (1U << 5 ) ; + DSI; + DST( 6) = ( (*in) >> 30 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 3 ))<<( 5 - 3 ); + DSI; + DST( 7) = ( (*in) >> 3 ) % (1U << 5 ) ; + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 5 ) ; + DSI; + DST( 9) = ( (*in) >> 13 ) % (1U << 5 ) ; + DSI; + DST(10) 
= ( (*in) >> 18 ) % (1U << 5 ) ; + DSI; + DST(11) = ( (*in) >> 23 ) % (1U << 5 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 1 ))<<( 5 - 1 ); + DSI; + DST(13) = ( (*in) >> 1 ) % (1U << 5 ) ; + DSI; + DST(14) = ( (*in) >> 6 ) % (1U << 5 ) ; + DSI; + DST(15) = ( (*in) >> 11 ) % (1U << 5 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 5 ) ; + DSI; + DST(17) = ( (*in) >> 21 ) % (1U << 5 ) ; + DSI; + DST(18) = ( (*in) >> 26 ) % (1U << 5 ) ; + DSI; + DST(19) = ( (*in) >> 31 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 5 - 4 ); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 5 ) ; + DSI; + DST(21) = ( (*in) >> 9 ) % (1U << 5 ) ; + DSI; + DST(22) = ( (*in) >> 14 ) % (1U << 5 ) ; + DSI; + DST(23) = ( (*in) >> 19 ) % (1U << 5 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) % (1U << 5 ) ; + DSI; + DST(25) = ( (*in) >> 29 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 2 ))<<( 5 - 2 ); + DSI; + DST(26) = ( (*in) >> 2 ) % (1U << 5 ) ; + DSI; + DST(27) = ( (*in) >> 7 ) % (1U << 5 ) ; + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 5 ) ; + DSI; + DST(29) = ( (*in) >> 17 ) % (1U << 5 ) ; + DSI; + DST(30) = ( (*in) >> 22 ) % (1U << 5 ) ; + DSI; + DST(31) = ( (*in) >> 27 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack6_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST( 1) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST( 2) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST( 3) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) % (1U << 6 ) ; + DSI; + DST( 5) = ( (*in) >> 30 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST( 6) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST( 7) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST( 9) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(10) = ( (*in) >> 28 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(11) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(13) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(14) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(15) = ( (*in) >> 26 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 6 ) ; + DSI; + DST(17) = ( (*in) >> 6 ) % (1U << 6 ) ; + DSI; + DST(18) = ( (*in) >> 12 ) % (1U << 6 ) ; + DSI; + DST(19) = ( (*in) >> 18 ) % (1U << 6 ) ; + DSI; + DST(20) = ( (*in) >> 24 ) % (1U << 6 ) ; + DSI; + DST(21) = ( (*in) >> 30 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 4 ))<<( 6 - 4 ); + DSI; + DST(22) = ( (*in) >> 4 ) % (1U << 6 ) ; + DSI; + DST(23) = ( (*in) >> 10 ) % (1U << 6 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 6 ) ; + DSI; + DST(25) = ( (*in) >> 22 ) % (1U << 6 ) ; + DSI; + DST(26) = ( (*in) >> 28 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 2 ))<<( 6 - 2 ); + DSI; + DST(27) = ( (*in) >> 2 ) % (1U << 6 ) ; + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 6 ) ; + DSI; + DST(29) = ( (*in) >> 14 ) % (1U << 6 ) ; + DSI; + DST(30) = ( (*in) >> 20 ) % (1U << 6 ) ; + DSI; + DST(31) = ( (*in) >> 26 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack7_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 7 ) ; + DSI; + DST( 1) = ( (*in) >> 7 ) % (1U << 7 ) ; + DSI; + DST( 2) = ( (*in) >> 14 ) % (1U << 7 ) ; + DSI; + DST( 3) = ( (*in) >> 21 ) % (1U << 7 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 3 ))<<( 7 - 3 ); + DSI; + DST( 5) = ( (*in) >> 3 ) % (1U << 7 ) ; + DSI; + DST( 6) = ( (*in) 
>> 10 ) % (1U << 7 ) ; + DSI; + DST( 7) = ( (*in) >> 17 ) % (1U << 7 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) % (1U << 7 ) ; + DSI; + DST( 9) = ( (*in) >> 31 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 6 ))<<( 7 - 6 ); + DSI; + DST(10) = ( (*in) >> 6 ) % (1U << 7 ) ; + DSI; + DST(11) = ( (*in) >> 13 ) % (1U << 7 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) % (1U << 7 ) ; + DSI; + DST(13) = ( (*in) >> 27 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 2 ))<<( 7 - 2 ); + DSI; + DST(14) = ( (*in) >> 2 ) % (1U << 7 ) ; + DSI; + DST(15) = ( (*in) >> 9 ) % (1U << 7 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 7 ) ; + DSI; + DST(17) = ( (*in) >> 23 ) % (1U << 7 ) ; + DSI; + DST(18) = ( (*in) >> 30 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 5 ))<<( 7 - 5 ); + DSI; + DST(19) = ( (*in) >> 5 ) % (1U << 7 ) ; + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 7 ) ; + DSI; + DST(21) = ( (*in) >> 19 ) % (1U << 7 ) ; + DSI; + DST(22) = ( (*in) >> 26 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 1 ))<<( 7 - 1 ); + DSI; + DST(23) = ( (*in) >> 1 ) % (1U << 7 ) ; + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 7 ) ; + DSI; + DST(25) = ( (*in) >> 15 ) % (1U << 7 ) ; + DSI; + DST(26) = ( (*in) >> 22 ) % (1U << 7 ) ; + DSI; + DST(27) = ( (*in) >> 29 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 7 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 7 ) ; + DSI; + DST(29) = ( (*in) >> 11 ) % (1U << 7 ) ; + DSI; + DST(30) = ( (*in) >> 18 ) % (1U << 7 ) ; + DSI; + DST(31) = ( (*in) >> 25 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack8_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 1) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 2) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 3) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 5) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST( 6) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST( 7) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST( 9) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(10) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(11) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(13) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(14) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(15) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(17) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(18) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(19) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(21) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(22) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(23) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(25) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(26) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(27) = ( (*in) >> 24 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 8 ) ; + DSI; + DST(29) = ( (*in) >> 8 ) % (1U << 8 ) ; + DSI; + DST(30) = ( (*in) >> 16 ) % (1U << 8 ) ; + DSI; + DST(31) = ( (*in) >> 24 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack9_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 9 ) ; + DSI; + DST( 1) = ( (*in) >> 9 ) % (1U << 9 ) ; + DSI; + DST( 2) = ( (*in) >> 18 ) % (1U << 9 ) ; + DSI; + DST( 3) = ( (*in) >> 27 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 9 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 9 ) ; + 
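/* 9-bit fields: a value that straddles a word boundary is rebuilt in two steps - its low bits come from the top of the current word, its high bits are OR-ed in from the bottom of the next after ++in */ +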
DSI; + DST( 5) = ( (*in) >> 13 ) % (1U << 9 ) ; + DSI; + DST( 6) = ( (*in) >> 22 ) % (1U << 9 ) ; + DSI; + DST( 7) = ( (*in) >> 31 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 9 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 9 ) ; + DSI; + DST( 9) = ( (*in) >> 17 ) % (1U << 9 ) ; + DSI; + DST(10) = ( (*in) >> 26 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 3 ))<<( 9 - 3 ); + DSI; + DST(11) = ( (*in) >> 3 ) % (1U << 9 ) ; + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 9 ) ; + DSI; + DST(13) = ( (*in) >> 21 ) % (1U << 9 ) ; + DSI; + DST(14) = ( (*in) >> 30 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 7 ))<<( 9 - 7 ); + DSI; + DST(15) = ( (*in) >> 7 ) % (1U << 9 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 9 ) ; + DSI; + DST(17) = ( (*in) >> 25 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 2 ))<<( 9 - 2 ); + DSI; + DST(18) = ( (*in) >> 2 ) % (1U << 9 ) ; + DSI; + DST(19) = ( (*in) >> 11 ) % (1U << 9 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) % (1U << 9 ) ; + DSI; + DST(21) = ( (*in) >> 29 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 6 ))<<( 9 - 6 ); + DSI; + DST(22) = ( (*in) >> 6 ) % (1U << 9 ) ; + DSI; + DST(23) = ( (*in) >> 15 ) % (1U << 9 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 1 ))<<( 9 - 1 ); + DSI; + DST(25) = ( (*in) >> 1 ) % (1U << 9 ) ; + DSI; + DST(26) = ( (*in) >> 10 ) % (1U << 9 ) ; + DSI; + DST(27) = ( (*in) >> 19 ) % (1U << 9 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 5 ))<<( 9 - 5 ); + DSI; + DST(29) = ( (*in) >> 5 ) % (1U << 9 ) ; + DSI; + DST(30) = ( (*in) >> 14 ) % (1U << 9 ) ; + DSI; + DST(31) = ( (*in) >> 23 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack10_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST( 1) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST( 2) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST( 3) = ( (*in) >> 30 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST( 4) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST( 5) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST( 6) = ( (*in) >> 28 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST( 7) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST( 9) = ( (*in) >> 26 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + DSI; + DST(10) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(11) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(12) = ( (*in) >> 24 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(13) = ( (*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(14) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(15) = ( (*in) >> 22 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 10 ) ; + DSI; + DST(17) = ( (*in) >> 10 ) % (1U << 10 ) ; + DSI; + DST(18) = ( (*in) >> 20 ) % (1U << 10 ) ; + DSI; + DST(19) = ( (*in) >> 30 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 8 ))<<( 10 - 8 ); + DSI; + DST(20) = ( (*in) >> 8 ) % (1U << 10 ) ; + DSI; + DST(21) = ( (*in) >> 18 ) % (1U << 10 ) ; + DSI; + DST(22) = ( (*in) >> 28 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 6 ))<<( 10 - 6 ); + DSI; + DST(23) = ( (*in) >> 6 ) % (1U << 10 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 10 ) ; + DSI; + DST(25) = ( (*in) >> 26 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 4 ))<<( 10 - 4 ); + DSI; + DST(26) = ( (*in) >> 4 ) % (1U << 10 ) ; + DSI; + DST(27) = ( (*in) >> 14 ) % (1U << 10 ) ; + DSI; + DST(28) = ( (*in) >> 24 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 2 ))<<( 10 - 2 ); + DSI; + DST(29) = ( 
(*in) >> 2 ) % (1U << 10 ) ; + DSI; + DST(30) = ( (*in) >> 12 ) % (1U << 10 ) ; + DSI; + DST(31) = ( (*in) >> 22 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack11_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 11 ) ; + DSI; + DST( 1) = ( (*in) >> 11 ) % (1U << 11 ) ; + DSI; + DST( 2) = ( (*in) >> 22 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 1 ))<<( 11 - 1 ); + DSI; + DST( 3) = ( (*in) >> 1 ) % (1U << 11 ) ; + DSI; + DST( 4) = ( (*in) >> 12 ) % (1U << 11 ) ; + DSI; + DST( 5) = ( (*in) >> 23 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 2 ))<<( 11 - 2 ); + DSI; + DST( 6) = ( (*in) >> 2 ) % (1U << 11 ) ; + DSI; + DST( 7) = ( (*in) >> 13 ) % (1U << 11 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 3 ))<<( 11 - 3 ); + DSI; + DST( 9) = ( (*in) >> 3 ) % (1U << 11 ) ; + DSI; + DST(10) = ( (*in) >> 14 ) % (1U << 11 ) ; + DSI; + DST(11) = ( (*in) >> 25 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 4 ))<<( 11 - 4 ); + DSI; + DST(12) = ( (*in) >> 4 ) % (1U << 11 ) ; + DSI; + DST(13) = ( (*in) >> 15 ) % (1U << 11 ) ; + DSI; + DST(14) = ( (*in) >> 26 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 5 ))<<( 11 - 5 ); + DSI; + DST(15) = ( (*in) >> 5 ) % (1U << 11 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 11 ) ; + DSI; + DST(17) = ( (*in) >> 27 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 6 ))<<( 11 - 6 ); + DSI; + DST(18) = ( (*in) >> 6 ) % (1U << 11 ) ; + DSI; + DST(19) = ( (*in) >> 17 ) % (1U << 11 ) ; + DSI; + DST(20) = ( (*in) >> 28 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 7 ))<<( 11 - 7 ); + DSI; + DST(21) = ( (*in) >> 7 ) % (1U << 11 ) ; + DSI; + DST(22) = ( (*in) >> 18 ) % (1U << 11 ) ; + DSI; + DST(23) = ( (*in) >> 29 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 ))<<( 11 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 11 ) ; + DSI; + DST(25) = ( (*in) >> 19 ) % (1U << 11 ) ; + DSI; + DST(26) = ( (*in) >> 30 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 9 ))<<( 11 - 9 ); + DSI; + DST(27) = ( (*in) >> 9 ) % (1U << 11 ) ; + DSI; + DST(28) = ( (*in) >> 20 ) % (1U << 11 ) ; + DSI; + DST(29) = ( (*in) >> 31 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 10 ))<<( 11 - 10 ); + DSI; + DST(30) = ( (*in) >> 10 ) % (1U << 11 ) ; + DSI; + DST(31) = ( (*in) >> 21 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack12_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 1) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST( 2) = ( (*in) >> 24 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST( 3) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST( 4) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST( 5) = ( (*in) >> 28 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST( 6) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST( 7) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST( 9) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(10) = ( (*in) >> 24 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(11) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(12) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(13) = ( (*in) >> 28 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(14) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(15) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(17) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(18) = ( (*in) >> 24 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + 
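/* the 12-bit layout repeats every 3 words (8 values), so this function is 4 copies of the same 8-value pattern */ +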
DSI; + DST(19) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(20) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(21) = ( (*in) >> 28 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(22) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(23) = ( (*in) >> 20 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 12 ) ; + DSI; + DST(25) = ( (*in) >> 12 ) % (1U << 12 ) ; + DSI; + DST(26) = ( (*in) >> 24 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 4 ))<<( 12 - 4 ); + DSI; + DST(27) = ( (*in) >> 4 ) % (1U << 12 ) ; + DSI; + DST(28) = ( (*in) >> 16 ) % (1U << 12 ) ; + DSI; + DST(29) = ( (*in) >> 28 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 8 ))<<( 12 - 8 ); + DSI; + DST(30) = ( (*in) >> 8 ) % (1U << 12 ) ; + DSI; + DST(31) = ( (*in) >> 20 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack13_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 13 ) ; + DSI; + DST( 1) = ( (*in) >> 13 ) % (1U << 13 ) ; + DSI; + DST( 2) = ( (*in) >> 26 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 7 ))<<( 13 - 7 ); + DSI; + DST( 3) = ( (*in) >> 7 ) % (1U << 13 ) ; + DSI; + DST( 4) = ( (*in) >> 20 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 1 ))<<( 13 - 1 ); + DSI; + DST( 5) = ( (*in) >> 1 ) % (1U << 13 ) ; + DSI; + DST( 6) = ( (*in) >> 14 ) % (1U << 13 ) ; + DSI; + DST( 7) = ( (*in) >> 27 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 13 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 13 ) ; + DSI; + DST( 9) = ( (*in) >> 21 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 2 ))<<( 13 - 2 ); + DSI; + DST(10) = ( (*in) >> 2 ) % (1U << 13 ) ; + DSI; + DST(11) = ( (*in) >> 15 ) % (1U << 13 ) ; + DSI; + DST(12) = ( (*in) >> 28 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 9 ))<<( 13 - 9 ); + DSI; + DST(13) = ( (*in) >> 9 ) % (1U << 13 ) ; + DSI; + DST(14) = ( (*in) >> 22 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 3 ))<<( 13 - 3 ); + DSI; + DST(15) = ( (*in) >> 3 ) % (1U << 13 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 13 ) ; + DSI; + DST(17) = ( (*in) >> 29 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 10 ))<<( 13 - 10 ); + DSI; + DST(18) = ( (*in) >> 10 ) % (1U << 13 ) ; + DSI; + DST(19) = ( (*in) >> 23 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 4 ))<<( 13 - 4 ); + DSI; + DST(20) = ( (*in) >> 4 ) % (1U << 13 ) ; + DSI; + DST(21) = ( (*in) >> 17 ) % (1U << 13 ) ; + DSI; + DST(22) = ( (*in) >> 30 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 11 ))<<( 13 - 11 ); + DSI; + DST(23) = ( (*in) >> 11 ) % (1U << 13 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 5 ))<<( 13 - 5 ); + DSI; + DST(25) = ( (*in) >> 5 ) % (1U << 13 ) ; + DSI; + DST(26) = ( (*in) >> 18 ) % (1U << 13 ) ; + DSI; + DST(27) = ( (*in) >> 31 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 12 ))<<( 13 - 12 ); + DSI; + DST(28) = ( (*in) >> 12 ) % (1U << 13 ) ; + DSI; + DST(29) = ( (*in) >> 25 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 6 ))<<( 13 - 6 ); + DSI; + DST(30) = ( (*in) >> 6 ) % (1U << 13 ) ; + DSI; + DST(31) = ( (*in) >> 19 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack14_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 14 ) ; + DSI; + DST( 1) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST( 2) = ( (*in) >> 28 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST( 3) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST( 4) = ( (*in) >> 24 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST( 5) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST( 6) = ( (*in) >> 20 ) ; + 
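/* DST(6) straddles into the next word here; overall, 16 14-bit values fill exactly 7 words, so the second half of the routine repeats the first */ +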
++in; + DST( 6) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST( 7) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST( 8) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST( 9) = ( (*in) >> 30 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(10) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(11) = ( (*in) >> 26 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(12) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(13) = ( (*in) >> 22 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(14) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(15) = ( (*in) >> 18 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 14 ) ; + DSI; + DST(17) = ( (*in) >> 14 ) % (1U << 14 ) ; + DSI; + DST(18) = ( (*in) >> 28 ) ; + ++in; + DST(18) |= ((*in) % (1U<< 10 ))<<( 14 - 10 ); + DSI; + DST(19) = ( (*in) >> 10 ) % (1U << 14 ) ; + DSI; + DST(20) = ( (*in) >> 24 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 6 ))<<( 14 - 6 ); + DSI; + DST(21) = ( (*in) >> 6 ) % (1U << 14 ) ; + DSI; + DST(22) = ( (*in) >> 20 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 2 ))<<( 14 - 2 ); + DSI; + DST(23) = ( (*in) >> 2 ) % (1U << 14 ) ; + DSI; + DST(24) = ( (*in) >> 16 ) % (1U << 14 ) ; + DSI; + DST(25) = ( (*in) >> 30 ) ; + ++in; + DST(25) |= ((*in) % (1U<< 12 ))<<( 14 - 12 ); + DSI; + DST(26) = ( (*in) >> 12 ) % (1U << 14 ) ; + DSI; + DST(27) = ( (*in) >> 26 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 8 ))<<( 14 - 8 ); + DSI; + DST(28) = ( (*in) >> 8 ) % (1U << 14 ) ; + DSI; + DST(29) = ( (*in) >> 22 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 4 ))<<( 14 - 4 ); + DSI; + DST(30) = ( (*in) >> 4 ) % (1U << 14 ) ; + DSI; + DST(31) = ( (*in) >> 18 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack15_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 15 ) ; + DSI; + DST( 1) = ( (*in) >> 15 ) % (1U << 15 ) ; + DSI; + DST( 2) = ( (*in) >> 30 ) ; + ++in; + DST( 2) |= ((*in) % (1U<< 13 ))<<( 15 - 13 ); + DSI; + DST( 3) = ( (*in) >> 13 ) % (1U << 15 ) ; + DSI; + DST( 4) = ( (*in) >> 28 ) ; + ++in; + DST( 4) |= ((*in) % (1U<< 11 ))<<( 15 - 11 ); + DSI; + DST( 5) = ( (*in) >> 11 ) % (1U << 15 ) ; + DSI; + DST( 6) = ( (*in) >> 26 ) ; + ++in; + DST( 6) |= ((*in) % (1U<< 9 ))<<( 15 - 9 ); + DSI; + DST( 7) = ( (*in) >> 9 ) % (1U << 15 ) ; + DSI; + DST( 8) = ( (*in) >> 24 ) ; + ++in; + DST( 8) |= ((*in) % (1U<< 7 ))<<( 15 - 7 ); + DSI; + DST( 9) = ( (*in) >> 7 ) % (1U << 15 ) ; + DSI; + DST(10) = ( (*in) >> 22 ) ; + ++in; + DST(10) |= ((*in) % (1U<< 5 ))<<( 15 - 5 ); + DSI; + DST(11) = ( (*in) >> 5 ) % (1U << 15 ) ; + DSI; + DST(12) = ( (*in) >> 20 ) ; + ++in; + DST(12) |= ((*in) % (1U<< 3 ))<<( 15 - 3 ); + DSI; + DST(13) = ( (*in) >> 3 ) % (1U << 15 ) ; + DSI; + DST(14) = ( (*in) >> 18 ) ; + ++in; + DST(14) |= ((*in) % (1U<< 1 ))<<( 15 - 1 ); + DSI; + DST(15) = ( (*in) >> 1 ) % (1U << 15 ) ; + DSI; + DST(16) = ( (*in) >> 16 ) % (1U << 15 ) ; + DSI; + DST(17) = ( (*in) >> 31 ) ; + ++in; + DST(17) |= ((*in) % (1U<< 14 ))<<( 15 - 14 ); + DSI; + DST(18) = ( (*in) >> 14 ) % (1U << 15 ) ; + DSI; + DST(19) = ( (*in) >> 29 ) ; + ++in; + DST(19) |= ((*in) % (1U<< 12 ))<<( 15 - 12 ); + DSI; + DST(20) = ( (*in) >> 12 ) % (1U << 15 ) ; + DSI; + DST(21) = ( (*in) >> 27 ) ; + ++in; + DST(21) |= ((*in) % (1U<< 10 ))<<( 15 - 10 ); + DSI; + DST(22) = ( (*in) >> 10 ) % (1U << 15 ) ; + DSI; + DST(23) = ( (*in) >> 25 ) ; + ++in; + DST(23) |= ((*in) % (1U<< 8 ))<<( 15 - 8 ); + DSI; + DST(24) = ( (*in) >> 8 ) % (1U << 15 ) ; + DSI; + DST(25) = ( (*in) >> 23 
) ; + ++in; + DST(25) |= ((*in) % (1U<< 6 ))<<( 15 - 6 ); + DSI; + DST(26) = ( (*in) >> 6 ) % (1U << 15 ) ; + DSI; + DST(27) = ( (*in) >> 21 ) ; + ++in; + DST(27) |= ((*in) % (1U<< 4 ))<<( 15 - 4 ); + DSI; + DST(28) = ( (*in) >> 4 ) % (1U << 15 ) ; + DSI; + DST(29) = ( (*in) >> 19 ) ; + ++in; + DST(29) |= ((*in) % (1U<< 2 ))<<( 15 - 2 ); + DSI; + DST(30) = ( (*in) >> 2 ) % (1U << 15 ) ; + DSI; + DST(31) = ( (*in) >> 17 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack16_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 1) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 2) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 3) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 4) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 5) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 6) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 7) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST( 8) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST( 9) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(10) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(11) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(12) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(13) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(14) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(15) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(16) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(17) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(18) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(19) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(20) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(21) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(22) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(23) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(24) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(25) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(26) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(27) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(28) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(29) = ( (*in) >> 16 ) ; + ++in; + DSI; + DST(30) = ( (*in) >> 0 ) % (1U << 16 ) ; + DSI; + DST(31) = ( (*in) >> 16 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack17_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + DST( 0) = ( (*in) >> 0 ) % (1U << 17 ) ; + DSI; + DST( 1) = ( (*in) >> 17 ) ; + ++in; + DST( 1) |= ((*in) % (1U<< 2 ))<<( 17 - 2 ); + DSI; + DST( 2) = ( (*in) >> 2 ) % (1U << 17 ) ; + DSI; + DST( 3) = ( (*in) >> 19 ) ; + ++in; + DST( 3) |= ((*in) % (1U<< 4 ))<<( 17 - 4 ); + DSI; + DST( 4) = ( (*in) >> 4 ) % (1U << 17 ) ; + DSI; + DST( 5) = ( (*in) >> 21 ) ; + ++in; + DST( 5) |= ((*in) % (1U<< 6 ))<<( 17 - 6 ); + DSI; + DST( 6) = ( (*in) >> 6 ) % (1U << 17 ) ; + DSI; + DST( 7) = ( (*in) >> 23 ) ; + ++in; + DST( 7) |= ((*in) % (1U<< 8 ))<<( 17 - 8 ); + DSI; + DST( 8) = ( (*in) >> 8 ) % (1U << 17 ) ; + DSI; + DST( 9) = ( (*in) >> 25 ) ; + ++in; + DST( 9) |= ((*in) % (1U<< 10 ))<<( 17 - 10 ); + DSI; + DST(10) = ( (*in) >> 10 ) % (1U << 17 ) ; + DSI; + DST(11) = ( (*in) >> 27 ) ; + ++in; + DST(11) |= ((*in) % (1U<< 12 ))<<( 17 - 12 ); + DSI; + DST(12) = ( (*in) >> 12 ) % (1U << 17 ) ; + DSI; + DST(13) = ( (*in) >> 29 ) ; + ++in; + DST(13) |= ((*in) % (1U<< 14 ))<<( 17 - 14 ); + DSI; + DST(14) = ( (*in) >> 14 ) % (1U << 17 ) ; + DSI; + DST(15) = ( (*in) >> 31 ) ; + ++in; + DST(15) |= ((*in) % (1U<< 16 ))<<( 17 - 16 ); + DSI; + DST(16) = ( (*in) >> 16 ) ; + ++in; + DST(16) |= ((*in) % (1U<< 1 ))<<( 17 - 1 ); + DSI; + DST(17) = ( (*in) >> 1 ) % (1U << 17 ) ; + DSI; + DST(18) = ( (*in) >> 18 ) ; + ++in; + DST(18) |= 
((*in) % (1U<< 3 ))<<( 17 - 3 ); + DSI; + DST(19) = ( (*in) >> 3 ) % (1U << 17 ) ; + DSI; + DST(20) = ( (*in) >> 20 ) ; + ++in; + DST(20) |= ((*in) % (1U<< 5 ))<<( 17 - 5 ); + DSI; + DST(21) = ( (*in) >> 5 ) % (1U << 17 ) ; + DSI; + DST(22) = ( (*in) >> 22 ) ; + ++in; + DST(22) |= ((*in) % (1U<< 7 ))<<( 17 - 7 ); + DSI; + DST(23) = ( (*in) >> 7 ) % (1U << 17 ) ; + DSI; + DST(24) = ( (*in) >> 24 ) ; + ++in; + DST(24) |= ((*in) % (1U<< 9 ))<<( 17 - 9 ); + DSI; + DST(25) = ( (*in) >> 9 ) % (1U << 17 ) ; + DSI; + DST(26) = ( (*in) >> 26 ) ; + ++in; + DST(26) |= ((*in) % (1U<< 11 ))<<( 17 - 11 ); + DSI; + DST(27) = ( (*in) >> 11 ) % (1U << 17 ) ; + DSI; + DST(28) = ( (*in) >> 28 ) ; + ++in; + DST(28) |= ((*in) % (1U<< 13 ))<<( 17 - 13 ); + DSI; + DST(29) = ( (*in) >> 13 ) % (1U << 17 ) ; + DSI; + DST(30) = ( (*in) >> 30 ) ; + ++in; + DST(30) |= ((*in) % (1U<< 15 ))<<( 17 - 15 ); + DSI; + DST(31) = ( (*in) >> 15 ) ; + ++in; + DSI; + + return in; + } + + + + +const uint32_t * __fastunpack18_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 18 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 18 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 18 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 18 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 18 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 18 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 18 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 18 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 18 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack19_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 19 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 
19 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 19 - 12 ); + out++; + *out = ( (*in) >> 12 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 19 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 19 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 19 - 11 ); + out++; + *out = ( (*in) >> 11 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 19 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 19 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 19 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 19 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 19 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 19 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 19 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 19 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 19 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 19 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 19 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 19 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 19 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 19 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack20_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 20 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 20 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 20 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 20 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 20 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack21_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 21 - 10 ); + out++; + *out = ( (*in) >> 10 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 21 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 21 - 9 ); + out++; + *out = ( (*in) >> 9 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 21 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 21 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 21 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 21 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 21 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 21 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 21 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 21 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 21 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 21 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 21 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 21 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 21 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 21 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 21 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 21 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 21 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 21 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack22_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) 
% (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 22 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 22 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 22 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 22 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 22 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 22 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 22 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 22 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 22 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 22 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 22 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack23_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 23 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 23 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 23 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 23 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 23 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 23 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 23 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 23 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 23 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 23 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % 
(1U<< 16 ))<<( 23 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 23 - 7 ); + out++; + *out = ( (*in) >> 7 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 23 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 23 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 23 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 23 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 23 - 8 ); + out++; + *out = ( (*in) >> 8 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 23 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 23 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 23 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 23 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 23 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 23 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack24_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 24 ) ; + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 24 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 24 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + out++; + + return in; + } + + + + +const 
uint32_t * __fastunpack25_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 25 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 25 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 25 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 25 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 25 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 25 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 25 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 25 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 25 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 25 - 5 ); + out++; + *out = ( (*in) >> 5 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 25 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 25 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 25 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 25 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 25 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 25 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 25 - 6 ); + out++; + *out = ( (*in) >> 6 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 25 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 25 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 25 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 25 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 25 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 25 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 25 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 25 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack26_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) 
% (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 26 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 26 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 26 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 26 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 26 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 26 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 26 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 26 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 26 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 26 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 26 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 26 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 26 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack27_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 27 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 27 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 27 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 27 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 27 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 27 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 27 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 27 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 27 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 27 - 4 ); + out++; + *out = ( (*in) >> 4 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 27 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 27 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 27 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 27 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 27 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 27 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 27 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 
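/*
 * __fastunpack26_32 above splits into two identical halves: since
 * 16 x 26 = 416 = 13 x 32, sixteen values occupy exactly thirteen words,
 * so the thirteen-word schedule is simply written out twice.
 */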
18 ))<<( 27 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 27 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 27 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 27 - 3 ); + out++; + *out = ( (*in) >> 3 ) % (1U << 27 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 27 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 27 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 27 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 27 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 27 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack28_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 28 ) ; + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 28 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 28 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 28 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 28 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 28 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 28 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack29_32(const uint32_t * __restrict__ in, uint32_t * 
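/*
 * Block arithmetic behind these routines: each __fastunpackB_32 decodes
 * exactly 32 integers, so it always consumes exactly B words
 * (32 values x B bits = 32B bits = B 32-bit words). The visible repetition in
 * __fastunpack28_32 above is the shorter period lcm(28, 32) = 224 bits:
 *
 *   8 * 28 == 7 * 32   // 8 values fill 7 words exactly, unrolled 4 times
 */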
__restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 29 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 29 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 29 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 29 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 29 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 29 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 29 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 29 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 29 - 2 ); + out++; + *out = ( (*in) >> 2 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 29 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 29 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 29 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 29 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 29 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 29 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 29 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 29 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 29 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 29 - 1 ); + out++; + *out = ( (*in) >> 1 ) % (1U << 29 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 29 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 29 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 29 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 29 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 29 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 29 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 29 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 29 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 29 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack30_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 
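/*
 * Reading the straddle pattern: when a value spans a word boundary, the pair
 *
 *   *out  = (*in) >> s;  ++in;                  // top 32 - s bits give the low part
 *   *out |= ((*in) % (1U << r)) << (b - r);     // low r bits give the high part
 *
 * recovers it, where r = b - (32 - s) is the number of bits that spilled into
 * the next word. In __fastunpack29_32 above, the value starting at s = 26
 * takes 6 bits from the first word and r = 23 from the second, shifted left
 * by 29 - 23 = 6.
 */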
- 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) % (1U << 30 ) ; + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 30 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 30 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 30 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 30 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 30 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 30 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 30 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 14 ))<<( 30 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 30 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 30 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 30 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 30 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 30 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 30 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack31_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) % (1U << 31 ) ; + out++; + *out = ( (*in) >> 31 ) ; + ++in; + *out |= ((*in) % (1U<< 30 ))<<( 31 - 30 ); + out++; + *out = ( (*in) >> 30 ) ; + ++in; + *out |= ((*in) % (1U<< 29 ))<<( 31 - 29 ); + out++; + *out = ( (*in) >> 29 ) ; + ++in; + *out |= ((*in) % (1U<< 28 ))<<( 31 - 28 ); + out++; + *out = ( (*in) >> 28 ) ; + ++in; + *out |= ((*in) % (1U<< 27 ))<<( 31 - 27 ); + out++; + *out = ( (*in) >> 27 ) ; + ++in; + *out |= ((*in) % (1U<< 26 ))<<( 31 - 26 ); + out++; + *out = ( (*in) >> 26 ) ; + ++in; + *out |= ((*in) % (1U<< 25 ))<<( 31 - 25 ); + out++; + *out = ( (*in) >> 25 ) ; + ++in; + *out |= ((*in) % (1U<< 24 ))<<( 31 - 24 ); + out++; + *out = ( (*in) >> 24 ) ; + ++in; + *out |= ((*in) % (1U<< 23 ))<<( 31 - 23 ); + out++; + *out = ( (*in) >> 23 ) ; + ++in; + *out |= ((*in) % (1U<< 22 ))<<( 31 - 22 ); + out++; + *out = ( (*in) >> 22 ) ; + ++in; + *out |= ((*in) % (1U<< 21 ))<<( 31 - 21 ); + out++; + *out = ( (*in) >> 21 ) ; + ++in; + *out |= ((*in) % (1U<< 20 ))<<( 31 - 20 ); + out++; + *out = ( (*in) >> 20 ) ; + ++in; + *out |= ((*in) % (1U<< 19 ))<<( 31 - 19 ); + out++; + *out = ( (*in) >> 19 ) ; + ++in; + *out |= ((*in) % (1U<< 18 ))<<( 31 - 18 ); + out++; + *out = ( (*in) >> 18 ) ; + ++in; + *out |= ((*in) % (1U<< 17 ))<<( 31 - 17 ); + out++; + *out = ( (*in) >> 17 ) ; + ++in; + *out |= ((*in) % (1U<< 16 ))<<( 31 - 16 ); + out++; + *out = ( (*in) >> 16 ) ; + ++in; + *out |= ((*in) % (1U<< 15 ))<<( 31 - 15 ); + out++; + *out = ( (*in) >> 15 ) ; + ++in; + *out 
|= ((*in) % (1U<< 14 ))<<( 31 - 14 ); + out++; + *out = ( (*in) >> 14 ) ; + ++in; + *out |= ((*in) % (1U<< 13 ))<<( 31 - 13 ); + out++; + *out = ( (*in) >> 13 ) ; + ++in; + *out |= ((*in) % (1U<< 12 ))<<( 31 - 12 ); + out++; + *out = ( (*in) >> 12 ) ; + ++in; + *out |= ((*in) % (1U<< 11 ))<<( 31 - 11 ); + out++; + *out = ( (*in) >> 11 ) ; + ++in; + *out |= ((*in) % (1U<< 10 ))<<( 31 - 10 ); + out++; + *out = ( (*in) >> 10 ) ; + ++in; + *out |= ((*in) % (1U<< 9 ))<<( 31 - 9 ); + out++; + *out = ( (*in) >> 9 ) ; + ++in; + *out |= ((*in) % (1U<< 8 ))<<( 31 - 8 ); + out++; + *out = ( (*in) >> 8 ) ; + ++in; + *out |= ((*in) % (1U<< 7 ))<<( 31 - 7 ); + out++; + *out = ( (*in) >> 7 ) ; + ++in; + *out |= ((*in) % (1U<< 6 ))<<( 31 - 6 ); + out++; + *out = ( (*in) >> 6 ) ; + ++in; + *out |= ((*in) % (1U<< 5 ))<<( 31 - 5 ); + out++; + *out = ( (*in) >> 5 ) ; + ++in; + *out |= ((*in) % (1U<< 4 ))<<( 31 - 4 ); + out++; + *out = ( (*in) >> 4 ) ; + ++in; + *out |= ((*in) % (1U<< 3 ))<<( 31 - 3 ); + out++; + *out = ( (*in) >> 3 ) ; + ++in; + *out |= ((*in) % (1U<< 2 ))<<( 31 - 2 ); + out++; + *out = ( (*in) >> 2 ) ; + ++in; + *out |= ((*in) % (1U<< 1 ))<<( 31 - 1 ); + out++; + *out = ( (*in) >> 1 ) ; + ++in; + out++; + + return in; + } + + + + +const uint32_t * __fastunpack32_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out) { + + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + *out = ( (*in) >> 0 ) ; + ++in; + out++; + + return in; + } + + + + const uint32_t * fastunpack_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullunpacker32(in,out); + + case 1: + return __fastunpack1_32(in,out); + + case 2: + return __fastunpack2_32(in,out); + + case 3: + return __fastunpack3_32(in,out); + + case 4: + return __fastunpack4_32(in,out); + + case 5: + return __fastunpack5_32(in,out); + + case 6: + return __fastunpack6_32(in,out); + + case 7: + return __fastunpack7_32(in,out); + + case 8: + return __fastunpack8_32(in,out); + + case 9: + return __fastunpack9_32(in,out); + + case 10: + return __fastunpack10_32(in,out); + + case 11: + return __fastunpack11_32(in,out); + + case 12: + return __fastunpack12_32(in,out); + + case 13: + return __fastunpack13_32(in,out); + + case 
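/*
 * For bit = 32 the "unpacking" above degenerates into a plain 32-word copy,
 * and the fastunpack_32 dispatcher that follows just selects the unrolled
 * specialization for each width (bit = 0 maps to the null packer/unpacker).
 * A hedged round-trip sketch; the buffer names and the width computation are
 * illustrative, not part of this file:
 *
 *   uint32_t values[32], packed[32], decoded[32];
 *   // ... fill values[] ...
 *   uint32_t w = 0, bit = 0;
 *   for (int k = 0; k < 32; k++) w |= values[k];
 *   while (bit < 32 && (w >> bit)) bit++;        // smallest width holding every value
 *   fastpackwithoutmask_32(values, packed, bit); // writes `bit` words
 *   fastunpack_32(packed, decoded, bit);         // reads them back
 */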
14: + return __fastunpack14_32(in,out); + + case 15: + return __fastunpack15_32(in,out); + + case 16: + return __fastunpack16_32(in,out); + + case 17: + return __fastunpack17_32(in,out); + + case 18: + return __fastunpack18_32(in,out); + + case 19: + return __fastunpack19_32(in,out); + + case 20: + return __fastunpack20_32(in,out); + + case 21: + return __fastunpack21_32(in,out); + + case 22: + return __fastunpack22_32(in,out); + + case 23: + return __fastunpack23_32(in,out); + + case 24: + return __fastunpack24_32(in,out); + + case 25: + return __fastunpack25_32(in,out); + + case 26: + return __fastunpack26_32(in,out); + + case 27: + return __fastunpack27_32(in,out); + + case 28: + return __fastunpack28_32(in,out); + + case 29: + return __fastunpack29_32(in,out); + + case 30: + return __fastunpack30_32(in,out); + + case 31: + return __fastunpack31_32(in,out); + + case 32: + return __fastunpack32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + + + + /*assumes that integers fit in the prescribed number of bits*/ + uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit) { + switch(bit) { + case 0: return nullpacker(in,out); + + case 1: + return __fastpackwithoutmask1_32(in,out); + + case 2: + return __fastpackwithoutmask2_32(in,out); + + case 3: + return __fastpackwithoutmask3_32(in,out); + + case 4: + return __fastpackwithoutmask4_32(in,out); + + case 5: + return __fastpackwithoutmask5_32(in,out); + + case 6: + return __fastpackwithoutmask6_32(in,out); + + case 7: + return __fastpackwithoutmask7_32(in,out); + + case 8: + return __fastpackwithoutmask8_32(in,out); + + case 9: + return __fastpackwithoutmask9_32(in,out); + + case 10: + return __fastpackwithoutmask10_32(in,out); + + case 11: + return __fastpackwithoutmask11_32(in,out); + + case 12: + return __fastpackwithoutmask12_32(in,out); + + case 13: + return __fastpackwithoutmask13_32(in,out); + + case 14: + return __fastpackwithoutmask14_32(in,out); + + case 15: + return __fastpackwithoutmask15_32(in,out); + + case 16: + return __fastpackwithoutmask16_32(in,out); + + case 17: + return __fastpackwithoutmask17_32(in,out); + + case 18: + return __fastpackwithoutmask18_32(in,out); + + case 19: + return __fastpackwithoutmask19_32(in,out); + + case 20: + return __fastpackwithoutmask20_32(in,out); + + case 21: + return __fastpackwithoutmask21_32(in,out); + + case 22: + return __fastpackwithoutmask22_32(in,out); + + case 23: + return __fastpackwithoutmask23_32(in,out); + + case 24: + return __fastpackwithoutmask24_32(in,out); + + case 25: + return __fastpackwithoutmask25_32(in,out); + + case 26: + return __fastpackwithoutmask26_32(in,out); + + case 27: + return __fastpackwithoutmask27_32(in,out); + + case 28: + return __fastpackwithoutmask28_32(in,out); + + case 29: + return __fastpackwithoutmask29_32(in,out); + + case 30: + return __fastpackwithoutmask30_32(in,out); + + case 31: + return __fastpackwithoutmask31_32(in,out); + + case 32: + return __fastpackwithoutmask32_32(in,out); + + default: + break; + } + //throw logic_error("number of bits is unsupported"); + } + diff --git a/aux/simdcomp/bitpacka.h b/aux/simdcomp/bitpacka.h new file mode 100644 index 0000000..5efce66 --- /dev/null +++ b/aux/simdcomp/bitpacka.h @@ -0,0 +1,28 @@ +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. 
+ * + * (c) Daniel Lemire, http://lemire.me/en/ + */ +#ifndef BITPACKINGALIGNED +#define BITPACKINGALIGNED +#include +#include +#include + +const uint32_t * fastunpack_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); +uint32_t * fastpackwithoutmask_8(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); + +const uint32_t * fastunpack_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); +uint32_t * fastpackwithoutmask_16(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); + +const uint32_t * fastunpack_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); +uint32_t * fastpackwithoutmask_24(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); + +const uint32_t * fastunpack_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); + +uint32_t * fastpackwithoutmask_32(const uint32_t * __restrict__ in, uint32_t * __restrict__ out, const uint32_t bit); + + + +#endif // BITPACKINGALIGNED diff --git a/aux/simdcomp/bitpacka.o b/aux/simdcomp/bitpacka.o new file mode 100644 index 0000000..a778450 Binary files /dev/null and b/aux/simdcomp/bitpacka.o differ diff --git a/aux/simdcomp/example.c b/aux/simdcomp/example.c new file mode 100644 index 0000000..0394e20 --- /dev/null +++ b/aux/simdcomp/example.c @@ -0,0 +1,66 @@ +#include +#include +#include "simdcomp.h" + + +// compresses data from datain to buffer, returns how many bytes written +size_t compress(uint32_t * datain, size_t length, uint8_t * buffer) { + if(length/SIMDBlockSize*SIMDBlockSize != length) { + printf("Data length should be a multiple of %i \n",SIMDBlockSize); + } + uint32_t offset = 0; + uint8_t * initout = buffer; + for(size_t k = 0; k < length / SIMDBlockSize; ++k) { + uint32_t b = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + *buffer++ = b; + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, (__m128i *) buffer, + b); + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + buffer += b * sizeof(__m128i); + } + return buffer - initout; +} + + +int main() { + int REPEAT = 5; + int N = 1000000 * SIMDBlockSize;//SIMDBlockSize is 128 + uint32_t * datain = malloc(N * sizeof(uint32_t)); + size_t compsize; + clock_t start, end; + + uint8_t * buffer = malloc(N * sizeof(uint32_t) + N / SIMDBlockSize); // output buffer + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (int gap = 1; gap <= 243; gap *= 3) { + printf("\n"); + printf(" gap = %u \n", gap); + for (int k = 0; k < N; ++k) + datain[k] = k * gap; + uint32_t offset = 0; + compsize = compress(datain,N,buffer); + printf("compression rate = %f \n", (N * sizeof(uint32_t))/ (compsize * 1.0 )); + start = clock(); + uint32_t bogus = 0; + for(int repeat = 0; repeat < REPEAT; ++repeat) { + uint8_t * decbuffer = buffer; + for (int k = 0; k * SIMDBlockSize < N; ++k) { + uint8_t b = *decbuffer++; + simdunpackd1(offset, (__m128i *) decbuffer, backbuffer, b); + // do something here with backbuffer + bogus += backbuffer[3]; + decbuffer += b * sizeof(__m128i); + offset = backbuffer[SIMDBlockSize - 1]; + } + } + end = clock(); + double numberofseconds = (end-start)/(double)CLOCKS_PER_SEC; + printf("decoding speed in million of integers per second %f \n",N*REPEAT/(numberofseconds*1000.0*1000.0)); + printf("ignore me %i \n",bogus); + } + free(buffer); + free(datain); + free(backbuffer); + return 0; +} + diff --git 
a/aux/simdcomp/include/simdbitpacking.h b/aux/simdcomp/include/simdbitpacking.h new file mode 100644 index 0000000..301f4f5 --- /dev/null +++ b/aux/simdcomp/include/simdbitpacking.h @@ -0,0 +1,21 @@ +/** + * This code is released under a BSD License. + */ +#ifndef SIMDBITPACKING_H_ +#define SIMDBITPACKING_H_ + +#include // SSE2 is required +#include // use a C99-compliant compiler, please +#include // for memset + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +void simdpack(const uint32_t * in,__m128i * out, uint32_t bit); + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +void simdpackwithoutmask(const uint32_t * in,__m128i * out, uint32_t bit); + +//reads "bit" 128-bit vectors from "in", writes 128 values to "out" +void simdunpack(const __m128i * in,uint32_t * out, uint32_t bit); + + +#endif /* SIMDBITPACKING_H_ */ diff --git a/aux/simdcomp/include/simdcomp.h b/aux/simdcomp/include/simdcomp.h new file mode 100644 index 0000000..8875f0f --- /dev/null +++ b/aux/simdcomp/include/simdcomp.h @@ -0,0 +1,12 @@ +/** + * This code is released under a BSD License. + */ + +#ifndef SIMDCOMP_H_ +#define SIMDCOMP_H_ + +#include "simdbitpacking.h" +#include "simdcomputil.h" +#include "simdintegratedbitpacking.h" + +#endif diff --git a/aux/simdcomp/include/simdcomputil.h b/aux/simdcomp/include/simdcomputil.h new file mode 100644 index 0000000..107665b --- /dev/null +++ b/aux/simdcomp/include/simdcomputil.h @@ -0,0 +1,29 @@ +/** + * This code is released under a BSD License. + */ + +#ifndef SIMDCOMPUTIL_H_ +#define SIMDCOMPUTIL_H_ + +#include // SSE2 is required +#include // use a C99-compliant compiler, please + + + + +// returns the integer logarithm of v (bit width) +uint32_t bits(const uint32_t v); + +// max integer logarithm over a range of SIMDBlockSize integers (128 integer) +uint32_t maxbits(const uint32_t * begin); + +enum{ SIMDBlockSize = 128}; + +// like maxbit over 128 integers (SIMDBlockSize) with provided initial value +// and using differential coding +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in); + + + + +#endif /* SIMDCOMPUTIL_H_ */ diff --git a/aux/simdcomp/include/simdintegratedbitpacking.h b/aux/simdcomp/include/simdintegratedbitpacking.h new file mode 100644 index 0000000..18ca795 --- /dev/null +++ b/aux/simdcomp/include/simdintegratedbitpacking.h @@ -0,0 +1,27 @@ +/** + * This code is released under a BSD License. 
+ */ + +#ifndef SIMD_INTEGRATED_BITPACKING_H +#define SIMD_INTEGRATED_BITPACKING_H + +#include // SSE2 is required +#include // use a C99-compliant compiler, please + +#include "simdcomputil.h" + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +// integer values should be in sorted order (for best results) +void simdpackd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); + + +//reads 128 values from "in", writes "bit" 128-bit vectors to "out" +// integer values should be in sorted order (for best results) +void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in,__m128i * out, uint32_t bit); + + +//reads "bit" 128-bit vectors from "in", writes 128 values to "out" +void simdunpackd1(uint32_t initvalue, const __m128i * in,uint32_t * out, uint32_t bit); + + +#endif diff --git a/aux/simdcomp/makefile b/aux/simdcomp/makefile new file mode 100644 index 0000000..6ebd9d9 --- /dev/null +++ b/aux/simdcomp/makefile @@ -0,0 +1,54 @@ +# minimalist makefile +.SUFFIXES: +# +.SUFFIXES: .cpp .o .c .h + +CFLAGS = -fPIC -std=c99 -O3 -Wall -Wextra -Wno-unused-parameter -pedantic +LDFLAGS = -shared +LIBNAME=libsimdcomp.so.0.0.3 +all: unit $(LIBNAME) +test: + ./unit +install: $(OBJECTS) + cp $(LIBNAME) /usr/local/lib + ln -s /usr/local/lib/$(LIBNAME) /usr/local/lib/libsimdcomp.so + ldconfig + cp $(HEADERS) /usr/local/include + + + +HEADERS=./include/simdbitpacking.h ./include/simdcomputil.h ./include/simdintegratedbitpacking.h ./include/simdcomp.h + +uninstall: + for h in $(HEADERS) ; do rm /usr/local/$$h; done + rm /usr/local/lib/$(LIBNAME) + rm /usr/local/lib/libsimdcomp.so + ldconfig + + +OBJECTS= simdbitpacking.o simdintegratedbitpacking.o simdcomputil.o + +$(LIBNAME): $(OBJECTS) + $(CC) $(CFLAGS) -o $(LIBNAME) $(OBJECTS) $(LDFLAGS) + + + +simdcomputil.o: ./src/simdcomputil.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdcomputil.c -Iinclude + +simdbitpacking.o: ./src/simdbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdbitpacking.c -Iinclude + +simdintegratedbitpacking.o: ./src/simdintegratedbitpacking.c $(HEADERS) + $(CC) $(CFLAGS) -c ./src/simdintegratedbitpacking.c -Iinclude + +example: ./example.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o example ./example.c -Iinclude $(OBJECTS) + +unit: ./src/unit.c $(HEADERS) $(OBJECTS) + $(CC) $(CFLAGS) -o unit ./src/unit.c -Iinclude $(OBJECTS) +dynunit: ./src/unit.c $(HEADERS) $(LIBNAME) + $(CC) $(CFLAGS) -o dynunit ./src/unit.c -Iinclude -lsimdcomp + +clean: + rm -f unit *.o $(LIBNAME) diff --git a/aux/simdcomp/src/simdbitpacking.c b/aux/simdcomp/src/simdbitpacking.c new file mode 100644 index 0000000..556a845 --- /dev/null +++ b/aux/simdcomp/src/simdbitpacking.c @@ -0,0 +1,14008 @@ +/** + * This code is released under a BSD License. 
+ */ +#include "../include/simdbitpacking.h" + + +static void SIMD_nullunpacker32(const __m128i * _in , uint32_t * out) { + memset(out,0,32 * 4 * 4); +} + +static void __SIMD_fastpackwithoutmask1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg 
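/*
 * These __SIMD_fastpackwithoutmask* routines are the vectorized mirror of the
 * scalar packers: one _mm_loadu_si128 pulls four consecutive integers into
 * four 32-bit lanes, so each lane accumulates every fourth value and a block
 * is 4 lanes x 32 values = 128 integers, emitted as `bit` 128-bit vectors.
 * Per lane, each step is the scalar
 *
 *   OutReg |= InReg << shift;   // _mm_or_si128 + _mm_slli_epi32
 */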
= _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
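/*
 * "withoutmask" means inputs are OR-ed in unmasked: a value wider than `bit`
 * bits would corrupt its neighbours' fields, so callers must guarantee that
 * every input is below 1 << bit, e.g. by first taking bit = maxbits(in) over
 * the 128-integer block (declared in simdcomputil.h above).
 */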
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = 
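/*
 * The carry idiom: after each store, `OutReg = _mm_srli_epi32(InReg, b - r)`
 * seeds the next output word with the bits that did not fit. In
 * __SIMD_fastpackwithoutmask5_32 above, the value placed at shift 30 gets
 * only its low 2 bits into the current word; the next word therefore starts
 * with its remaining 3 high bits, recovered as InReg >> (5 - 3).
 */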
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask11_32(const uint32_t * 
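/*
 * Widths sharing a factor with 32 re-synchronize early: for bit = 10,
 * sixteen values fill exactly five words (16 x 10 = 160 = 5 x 32), so
 * __SIMD_fastpackwithoutmask10_32 above restarts with a fresh `OutReg = InReg`
 * halfway through and repeats the same five-word schedule for the second
 * sixteen values.
 */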
_in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + 
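+ /* the next 11-bit value lands at bit 31: only its lowest bit fits here; after the store, _mm_srli_epi32(InReg, 11 - 10) carries the remaining 10 bits into the next output word */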
InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + 
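+ /* midpoint of the 14-bit packer: 16 values x 14 bits fill exactly seven 32-bit words, so no carry is pending and packing restarts below with a fresh OutReg = InReg */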
++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + InReg = _mm_loadu_si128(++in); + + 
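+ /* this 17-bit value lands at bit 30: 2 bits fit in the current word and _mm_srli_epi32(InReg, 17 - 15) after the store carries the high 15 bits over */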
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + 
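+ /* packs 128 21-bit integers (32 per 32-bit lane) into 21 SSE registers; "withoutmask" means no masking is applied, so each input is assumed to already fit in 21 bits */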
__m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); 
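+ /* only 8 of this value's 21 bits fit at bit 24; _mm_srli_epi32(InReg, 21 - 13) after the store carries the remaining 13 bits */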
+ + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); 
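+ /* the 10 carried high bits of the previous value occupy bits 0-9, so the next value is OR'd in at bit 10 */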
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_loadu_si128(++in); + + 
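+    /* Added commentary, not part of the generated source: a 26-bit value
+       that straddles a 32-bit lane is written in two pieces. Its low bits
+       topped off the previous output word, the _mm_srli_epi32(InReg, 26 - 16)
+       above carried its remaining high bits into the fresh OutReg, and the
+       next input is OR'd in at bit offset 16 so the packed stream stays
+       contiguous. */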
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + 
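+    /* Added commentary, not part of the generated source: with b = 28 the
+       layout realigns on a word boundary every 8 inputs (8 * 28 = 224 =
+       7 * 32), so the generator emits the same 8-load/7-store block four
+       times; the OutReg = InReg below starts the next repetition. */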
+ OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg 
= _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + 
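+    /* Added commentary, not part of the generated source: the "withoutmask"
+       packers trust the caller to pass values that already fit in b bits and
+       OR each load in unmasked; the __SIMD_fastpackN_32 variants further
+       below first AND every load with (1U << N) - 1. */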
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpackwithoutmask32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static 
void __SIMD_fastpackwithoutmask4_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_loadu_si128(in+4); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+5); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_loadu_si128(in+6); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_loadu_si128(in+7); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpackwithoutmask8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_loadu_si128(in+2); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_loadu_si128(in+3); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpackwithoutmask16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + __m128i InReg; + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_loadu_si128(in); + OutReg = InReg; + + InReg = _mm_loadu_si128(in+1); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + +static void __SIMD_fastpack1_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<1)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack2_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack3_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack5_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack6_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack7_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack9_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack10_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 
- 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack11_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack12_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + 
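+ /* Note on the schedule: 8 values x 12 bits = 96 bits = exactly three 32-bit words, so fastpack12 repeats the same three-word shift sequence four times per lane, with a clean seam (no carried bits) after every 8th value. */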
OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack13_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); 
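+ /* Carry pattern: a 13-bit value OR-ed in at bit offset s spills past bit 31 whenever s + 13 > 32; the full word is stored, and _mm_srli_epi32(InReg, 32 - s) (written above as 13 - k) moves the value's remaining high k bits to the bottom of the next output word. */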
+ + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack14_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack15_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + 
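+ /* Contract shared by all of these widths: each __SIMD_fastpackB_32 reads 128 uint32 values (32 unaligned __m128i loads) and writes exactly B __m128i words, since 128 x B bits = B x 128 bits; inputs are masked to their low B bits, so a lossless round-trip assumes every value fits in B bits. */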
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack17_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 17 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack18_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack19_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack20_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack21_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), 
mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack22_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack23_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = 
_mm_set1_epi32((1U<<23)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack24_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + 
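/* 24-bit case: every 4 inputs fill exactly 3 output words, so the same block repeats eight times; this closes the final block */ +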
InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack25_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 
25 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack26_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = 
_mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; 
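+ /* the 26-bit value placed at bit offset 16 straddles the word boundary: its top 10 bits carry into the next output word */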
+ OutReg = _mm_srli_epi32(InReg, 26 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack27_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + 
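/* with width 27 the bit offset never realigns to 0 before the end, so all 32 inputs are unrolled with no repeating block */ +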
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack28_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + 
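/* a 28-bit value at offset 28 overflows the word: 4 bits land here, the remaining 24 carry over via the srli by 28 - 24 below */ +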
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack29_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + InReg = 
_mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, 
OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack30_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 
10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack31_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + __m128i InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + 
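/* 31-bit packing leaves one spare bit per word, so each value's offset drops by one and every output word needs a carry */ +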
_mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + InReg = _mm_and_si128(_mm_loadu_si128(++in), mask); + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack32_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + ++out; + InReg = _mm_loadu_si128(++in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + +static void __SIMD_fastpack4_32(const uint32_t * _in, __m128i * out) { + 
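/* 4-bit packing: 8 values fill a 32-bit lane exactly (per lane: w = v0 | v1<<4 | ... | v7<<28), so each pass folds 8 input registers into one output register and no value straddles a word boundary */ +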
const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + uint32_t outer; + for(outer=0; outer< 4 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+4), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+5), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+6), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+7), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=8; + } + +} + + + +static void __SIMD_fastpack8_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + uint32_t outer; + for(outer=0; outer< 8 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+2), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + + InReg = _mm_and_si128(_mm_loadu_si128(in+3), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=4; + } + +} + + + +static void __SIMD_fastpack16_32(const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg, InReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + uint32_t outer; + for(outer=0; outer< 16 ;++outer) { + InReg = _mm_and_si128(_mm_loadu_si128(in), mask); + OutReg = InReg; + + InReg = _mm_and_si128(_mm_loadu_si128(in+1), mask); + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + ++out; + + in+=2; + } + +} + + + + +static void __SIMD_fastunpack1_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + __m128i InReg1 = _mm_loadu_si128(in); + __m128i InReg2 = InReg1; + __m128i OutReg1, OutReg2, OutReg3, OutReg4; + const __m128i mask = _mm_set1_epi32(1); + + unsigned shift = 0; + unsigned i; + for (i = 0; i < 8; ++i) { + OutReg1 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg2 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + OutReg3 = _mm_and_si128( _mm_srli_epi32(InReg1,shift++) , mask); + OutReg4 = _mm_and_si128( _mm_srli_epi32(InReg2,shift++) , mask); + _mm_storeu_si128(out++, OutReg1); + _mm_storeu_si128(out++, OutReg2); + _mm_storeu_si128(out++, OutReg3); + _mm_storeu_si128(out++, OutReg4); + } +} + + + + +static void __SIMD_fastunpack2_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<2)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack3_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<3)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,27) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,28) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack4_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<4)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack5_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<5)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,25) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + 
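/* after the two bits carried from the previous register, the 5-bit fields continue at offsets 3, 8, 13, 18 and 23 */ +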
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,26) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack6_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<6)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack7_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<7)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,24) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,23) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack8_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<8)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); 
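+ /* bits 24..31 hold the last 8-bit field of this word, so the shift alone isolates it and no mask is needed; the next source register is loaded before the store */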
+ + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack9_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<9)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,22) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,21) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack10_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<10)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); 
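+ /* 10-bit layout: three fields per 32-bit word at offsets 0, 10 and 20, leaving 2 bits that combine with the next register */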
+ + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
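/* fields continue at offsets 2 and 12; the final field occupies bits 22..31 exactly, so this register closes without a carry */ +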
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack11_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<11)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,19) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,20) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack12_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<12)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack13_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<13)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,17) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,18) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack14_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<14)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
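/* the 2-bit carry is resolved: 14-bit fields realign at offsets 2 and 16, and 12 bits spill over at offset 30 */ +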
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack15_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<15)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( 
_mm_srli_epi32(InReg,15) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,16) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + _mm_storeu_si128(out++, OutReg); + + 
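+ /* In-word case: a value that lies wholly inside the current lane word is
+    recovered with one shift-and-mask; the next statement,
+    _mm_and_si128(_mm_srli_epi32(InReg,6), mask), extracts the 15-bit value
+    beginning at bit 6. */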
OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack16_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<16)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = 
_mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack17_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<17)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,14) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; 
+ InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,13) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack18_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<18)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = 
_mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack19_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<19)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,12) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,11) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + 
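+ /* Straddling case: _mm_srli_epi32(InReg,27) above kept the low 5 bits of a
+    19-bit value; after loading the next input vector, the remaining 14 high
+    bits are shifted into place with _mm_slli_epi32(InReg, 19-14) (= 32-27),
+    masked, and OR-ed in before the store. */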
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack20_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<20)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack21_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<21)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,10) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,9) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack22_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<22)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + 
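+ /* For even widths such as 22 the bit layout repeats after lcm(22,32)/22 = 16
+    values (11 input vectors), so this body is the same 16-value sequence
+    emitted twice. */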
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack23_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<23)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,7) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,8) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack24_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<24)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); 
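+ /* Width 24 aligns every four values (4*24 = 96 bits = 3 words), so the same
+    group of four stores per three loads repeats eight times over the 24 input
+    vectors. */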
+ + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack25_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<25)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + 
_mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,5) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,6) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + 
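+ /* Per lane, each routine is equivalent to the scalar recurrence
+      v[k] = (w[(k*B)/32] >> (k*B)%32
+              | w[(k*B)/32 + 1] << (32 - (k*B)%32)) & ((1U<<B)-1)
+    with the second term present only when a value straddles a word boundary.
+    Here B = 25: the last value sits at bit 7 of the final word, hence the
+    closing _mm_srli_epi32(InReg,7) with no mask needed. */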
_mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack26_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<26)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack27_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<27)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,4) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,3) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack28_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<28)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack29_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<29)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,2) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( _mm_srli_epi32(InReg,1) , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack30_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<30)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + _mm_storeu_si128(out++, 
OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + _mm_storeu_si128(out++, OutReg); + + +} + + + + +static void __SIMD_fastunpack31_32(const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + const __m128i mask = _mm_set1_epi32((1U<<31)-1); + + OutReg = _mm_and_si128( InReg , mask); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,31) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,30) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,29) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,28) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,27) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,26) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,25) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,24) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,23) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,22) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,21) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,20) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,19) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,18) ; + InReg = 
_mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,17) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,16) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,15) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,14) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,13) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,12) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,11) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,10) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,9) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,8) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,7) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,6) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,5) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,4) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,3) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,2) ; + InReg = _mm_loadu_si128(++in); + + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + _mm_storeu_si128(out++, OutReg); + + OutReg = _mm_srli_epi32(InReg,1) ; + _mm_storeu_si128(out++, OutReg); + + +} + + +void __SIMD_fastunpack32_32(const __m128i* in, uint32_t * _out) { + __m128i* out = (__m128i*)(_out); + uint32_t outer; + for(outer=0; outer< 32 ;++outer) { + _mm_storeu_si128(out++, _mm_loadu_si128(in++)); + } +} + + + +void 
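+ /* Dispatcher: decodes one block of 128 32-bit integers that were packed
+    at `bit` bits each, reading `bit` 128-bit words from `in`. A minimal
+    round-trip sketch (illustrative only; `data`, `packed` and `recovered`
+    are hypothetical buffers, not part of this file):
+
+        uint32_t data[128], recovered[128];  // fill data[] first
+        __m128i  packed[32];                 // 32 words covers bit == 32
+        uint32_t b = maxbits(data);          // bit width needed (see simdcomputil.c)
+        simdpack(data, packed, b);           // 128 ints -> b 128-bit words
+        simdunpack(packed, recovered, b);    // recovers the original block
+ */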
simdunpack(const __m128i * in, uint32_t * out, const uint32_t bit) { + switch(bit) { + case 0: SIMD_nullunpacker32(in,out); return; + + case 1: __SIMD_fastunpack1_32(in,out); return; + + case 2: __SIMD_fastunpack2_32(in,out); return; + + case 3: __SIMD_fastunpack3_32(in,out); return; + + case 4: __SIMD_fastunpack4_32(in,out); return; + + case 5: __SIMD_fastunpack5_32(in,out); return; + + case 6: __SIMD_fastunpack6_32(in,out); return; + + case 7: __SIMD_fastunpack7_32(in,out); return; + + case 8: __SIMD_fastunpack8_32(in,out); return; + + case 9: __SIMD_fastunpack9_32(in,out); return; + + case 10: __SIMD_fastunpack10_32(in,out); return; + + case 11: __SIMD_fastunpack11_32(in,out); return; + + case 12: __SIMD_fastunpack12_32(in,out); return; + + case 13: __SIMD_fastunpack13_32(in,out); return; + + case 14: __SIMD_fastunpack14_32(in,out); return; + + case 15: __SIMD_fastunpack15_32(in,out); return; + + case 16: __SIMD_fastunpack16_32(in,out); return; + + case 17: __SIMD_fastunpack17_32(in,out); return; + + case 18: __SIMD_fastunpack18_32(in,out); return; + + case 19: __SIMD_fastunpack19_32(in,out); return; + + case 20: __SIMD_fastunpack20_32(in,out); return; + + case 21: __SIMD_fastunpack21_32(in,out); return; + + case 22: __SIMD_fastunpack22_32(in,out); return; + + case 23: __SIMD_fastunpack23_32(in,out); return; + + case 24: __SIMD_fastunpack24_32(in,out); return; + + case 25: __SIMD_fastunpack25_32(in,out); return; + + case 26: __SIMD_fastunpack26_32(in,out); return; + + case 27: __SIMD_fastunpack27_32(in,out); return; + + case 28: __SIMD_fastunpack28_32(in,out); return; + + case 29: __SIMD_fastunpack29_32(in,out); return; + + case 30: __SIMD_fastunpack30_32(in,out); return; + + case 31: __SIMD_fastunpack31_32(in,out); return; + + case 32: __SIMD_fastunpack32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpackwithoutmask(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpackwithoutmask1_32(in,out); return; + + case 2: __SIMD_fastpackwithoutmask2_32(in,out); return; + + case 3: __SIMD_fastpackwithoutmask3_32(in,out); return; + + case 4: __SIMD_fastpackwithoutmask4_32(in,out); return; + + case 5: __SIMD_fastpackwithoutmask5_32(in,out); return; + + case 6: __SIMD_fastpackwithoutmask6_32(in,out); return; + + case 7: __SIMD_fastpackwithoutmask7_32(in,out); return; + + case 8: __SIMD_fastpackwithoutmask8_32(in,out); return; + + case 9: __SIMD_fastpackwithoutmask9_32(in,out); return; + + case 10: __SIMD_fastpackwithoutmask10_32(in,out); return; + + case 11: __SIMD_fastpackwithoutmask11_32(in,out); return; + + case 12: __SIMD_fastpackwithoutmask12_32(in,out); return; + + case 13: __SIMD_fastpackwithoutmask13_32(in,out); return; + + case 14: __SIMD_fastpackwithoutmask14_32(in,out); return; + + case 15: __SIMD_fastpackwithoutmask15_32(in,out); return; + + case 16: __SIMD_fastpackwithoutmask16_32(in,out); return; + + case 17: __SIMD_fastpackwithoutmask17_32(in,out); return; + + case 18: __SIMD_fastpackwithoutmask18_32(in,out); return; + + case 19: __SIMD_fastpackwithoutmask19_32(in,out); return; + + case 20: __SIMD_fastpackwithoutmask20_32(in,out); return; + + case 21: __SIMD_fastpackwithoutmask21_32(in,out); return; + + case 22: __SIMD_fastpackwithoutmask22_32(in,out); return; + + case 23: __SIMD_fastpackwithoutmask23_32(in,out); return; + + case 24: __SIMD_fastpackwithoutmask24_32(in,out); return; + + case 25: __SIMD_fastpackwithoutmask25_32(in,out); 
return; + + case 26: __SIMD_fastpackwithoutmask26_32(in,out); return; + + case 27: __SIMD_fastpackwithoutmask27_32(in,out); return; + + case 28: __SIMD_fastpackwithoutmask28_32(in,out); return; + + case 29: __SIMD_fastpackwithoutmask29_32(in,out); return; + + case 30: __SIMD_fastpackwithoutmask30_32(in,out); return; + + case 31: __SIMD_fastpackwithoutmask31_32(in,out); return; + + case 32: __SIMD_fastpackwithoutmask32_32(in,out); return; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ +void simdpack(const uint32_t * in, __m128i * out, const uint32_t bit) { + switch(bit) { + case 0: return; + + case 1: __SIMD_fastpack1_32(in,out); return; + + case 2: __SIMD_fastpack2_32(in,out); return; + + case 3: __SIMD_fastpack3_32(in,out); return; + + case 4: __SIMD_fastpack4_32(in,out); return; + + case 5: __SIMD_fastpack5_32(in,out); return; + + case 6: __SIMD_fastpack6_32(in,out); return; + + case 7: __SIMD_fastpack7_32(in,out); return; + + case 8: __SIMD_fastpack8_32(in,out); return; + + case 9: __SIMD_fastpack9_32(in,out); return; + + case 10: __SIMD_fastpack10_32(in,out); return; + + case 11: __SIMD_fastpack11_32(in,out); return; + + case 12: __SIMD_fastpack12_32(in,out); return; + + case 13: __SIMD_fastpack13_32(in,out); return; + + case 14: __SIMD_fastpack14_32(in,out); return; + + case 15: __SIMD_fastpack15_32(in,out); return; + + case 16: __SIMD_fastpack16_32(in,out); return; + + case 17: __SIMD_fastpack17_32(in,out); return; + + case 18: __SIMD_fastpack18_32(in,out); return; + + case 19: __SIMD_fastpack19_32(in,out); return; + + case 20: __SIMD_fastpack20_32(in,out); return; + + case 21: __SIMD_fastpack21_32(in,out); return; + + case 22: __SIMD_fastpack22_32(in,out); return; + + case 23: __SIMD_fastpack23_32(in,out); return; + + case 24: __SIMD_fastpack24_32(in,out); return; + + case 25: __SIMD_fastpack25_32(in,out); return; + + case 26: __SIMD_fastpack26_32(in,out); return; + + case 27: __SIMD_fastpack27_32(in,out); return; + + case 28: __SIMD_fastpack28_32(in,out); return; + + case 29: __SIMD_fastpack29_32(in,out); return; + + case 30: __SIMD_fastpack30_32(in,out); return; + + case 31: __SIMD_fastpack31_32(in,out); return; + + case 32: __SIMD_fastpack32_32(in,out); return; + + default: break; + } +} + + + diff --git a/aux/simdcomp/src/simdbitpacking.o b/aux/simdcomp/src/simdbitpacking.o new file mode 100644 index 0000000..b582a09 Binary files /dev/null and b/aux/simdcomp/src/simdbitpacking.o differ diff --git a/aux/simdcomp/src/simdcomputil.c b/aux/simdcomp/src/simdcomputil.c new file mode 100644 index 0000000..9b36da5 --- /dev/null +++ b/aux/simdcomp/src/simdcomputil.c @@ -0,0 +1,56 @@ +#include "../include/simdcomputil.h" + +__attribute__((always_inline)) +static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +} + + +// returns the integer logarithm of v (bit width) +uint32_t bits(const uint32_t v) { +#ifdef _MSC_VER + if (v == 0) { + return 0; + } + unsigned long answer; + _BitScanReverse(&answer, v); + return answer + 1; +#else + return v == 0 ? 
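+ /* __builtin_clz(0) is undefined, hence the explicit zero case: */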
0 : 32 - __builtin_clz(v); // assume GCC-like compiler if not microsoft +#endif +} + +__attribute__ ((pure)) +uint32_t maxbits(const uint32_t * begin) { + uint32_t accumulator = 0;const uint32_t * k; + for (k = begin; k != begin + SIMDBlockSize; ++k) { + accumulator |= *k; + } + return bits(accumulator); +} + +static uint32_t maxbitas32int(const __m128i accumulator) { + uint32_t tmparray[4]; + _mm_storeu_si128((__m128i *) (tmparray), accumulator); + return bits(tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3]); +} + + +// maxbit over 128 integers (SIMDBlockSize) with provided initial value +uint32_t simdmaxbitsd1(uint32_t initvalue, const uint32_t * in) { + __m128i initoffset = _mm_set1_epi32 (initvalue); + const __m128i* pin = (const __m128i*)(in); + __m128i newvec = _mm_loadu_si128(pin); + __m128i accumulator = Delta(newvec , initoffset); + __m128i oldvec = newvec; + uint32_t k; + for(k = 1; 4*k < SIMDBlockSize; ++k) { + newvec = _mm_loadu_si128(pin+k); + accumulator = _mm_or_si128(accumulator,Delta(newvec , oldvec)); + oldvec = newvec; + } + initoffset = oldvec; + return maxbitas32int(accumulator); +} + diff --git a/aux/simdcomp/src/simdcomputil.o b/aux/simdcomp/src/simdcomputil.o new file mode 100644 index 0000000..6957faf Binary files /dev/null and b/aux/simdcomp/src/simdcomputil.o differ diff --git a/aux/simdcomp/src/simdintegratedbitpacking.c b/aux/simdcomp/src/simdintegratedbitpacking.c new file mode 100644 index 0000000..82e5d19 --- /dev/null +++ b/aux/simdcomp/src/simdintegratedbitpacking.c @@ -0,0 +1,24863 @@ +/** + * This code is released under a BSD License. + */ +#include "../include/simdintegratedbitpacking.h" + +__attribute__((always_inline)) +static inline __m128i Delta(__m128i curr, __m128i prev) { + return _mm_sub_epi32(curr, + _mm_or_si128(_mm_slli_si128(curr, 4), _mm_srli_si128(prev, 12))); +} + +__attribute__((always_inline)) +static inline __m128i PrefixSum(__m128i curr, __m128i prev) { + const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128(curr, 8), curr); + const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1); + return _mm_add_epi32(_tmp2, _mm_shuffle_epi32(prev, 0xff)); +} + + +__m128i iunpack0(__m128i initOffset, const __m128i * _in , uint32_t * _out) { + __m128i *out = (__m128i*)(_out); + const __m128i zero = _mm_set1_epi32 (0); + unsigned i; + for (i = 0; i < 8; ++i) { + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + initOffset = PrefixSum(zero, initOffset); + _mm_storeu_si128(out++, initOffset); + } + + return initOffset; +} + + + + +void ipackwithoutmask0(__m128i initOffset , const uint32_t * _in , __m128i * out) { + +} + + +void ipack0(__m128i initOffset , const uint32_t * _in , __m128i * out ) { +} + + + +void ipackwithoutmask1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
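+ /* d1 coding: each loaded vector is replaced by its lane-wise difference
+    with the preceding integer (Delta above) before the 1-bit fields are
+    OR-ed into the output word. */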
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg 
= Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack1(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), 
mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + 
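+ /* all 32 one-bit deltas now occupy bits 0..31 of each lane:
+    flush the single 128-bit output word. */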
_mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack2(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(3U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
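+ /* the raw (undifferenced) values become the reference for the next Delta */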
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + 
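/* generated kernel pattern: load the next four 32-bit values, form their lane-wise deltas against the previous vector (Delta), and OR the b-bit results into OutReg at increasing shift offsets; because b = 3 does not divide 32, the bits that overflow a word are carried into the next OutReg with _mm_srli_epi32(InReg, 3 - shift) right after each store */ +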
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack3(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(7U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 3 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + 
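/* b = 4 divides 32 evenly, so each 32-bit lane packs exactly eight 4-bit deltas (shifts 0, 4, ..., 28) and every store flushes a word-aligned group; none of the _mm_srli_epi32 carry steps needed for the odd widths appear here */ +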
++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack4(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(15U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
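/* initOffset always trails one vector behind the input, so Delta(CurrIn, initOffset) differences the current four values against the previous four: delta coding and bit packing are fused ("integrated") into a single pass over the data */ +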
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack5(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(31U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + 
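/* ipack5 is the masked twin of ipackwithoutmask5: the _mm_and_si128 with mask = (1<<5)-1 truncates every delta to its 5 low bits, while the "withoutmask" variants assume the caller has already verified that all deltas fit in b bits */ +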
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 5 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack6(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(63U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 6 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
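/* with b = 7, each 32-bit lane spreads its 32 deltas over seven output words; the carry amounts after the stores (7-3, 7-6, 7-2, 7-5, 7-1, 7-4) step through every shift residue until the layout realigns at the end of the block */ +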
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack7(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(127U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); 
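+ /* _mm_loadu_si128 / _mm_storeu_si128 are the unaligned load/store forms, so neither the input integers nor the packed output blocks need 16-byte alignment */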
+ InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 7 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void 
ipackwithoutmask8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); 
+ InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack8(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(255U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + 
initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack9(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(511U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 9 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; 
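+ /* Carry step (comment added for clarity, not in the generated source):
+    the previous 10-bit delta was written at bit offset 28 and spilled 6 bits
+    past the 32-bit word boundary; shifting it right by 10 - 6 = 4 places
+    those carry bits at the bottom of the new output word, and the next
+    delta is then ORed in at offset 6. */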
+ OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack10(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1023U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 10 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack11(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2047U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 11 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack12(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4095U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 12 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + 
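+ /* The 13-bit field at bit 20 fit only 12 bits into the word just stored;
+    srli by 12 (written 13 - 1) keeps its top bit at bit 0 of the new output
+    word, so the next delta is OR'ed in starting at bit 1. */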
+ OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack13(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8191U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 13 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + 
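+ /* Bits 26..31 now hold the low 6 bits of the current 14-bit delta; after
+    the store, srli by 6 (written 14 - 8) carries its remaining 8 bits into
+    the next output word. */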
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack14(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16383U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + 
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + 
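+ /* Unlike the withoutmask variant, ipack14 ANDs every delta with
+    (1 << 14) - 1 before packing, so the caller need not guarantee that
+    each delta fits in 14 bits. */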
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 14 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); 
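+ /* The previous 15-bit field straddled the word boundary: 12 bits landed
+    in the word just stored, srli by 12 (written 15 - 3) keeps its top
+    3 bits, and packing resumes at bit 3. */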
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack15(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(32767U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 15 - 2); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack16(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(65535U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + 
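+ /* b = 16 divides 32 evenly: exactly two 16-bit deltas per output word,
+    so no field ever straddles a word boundary and no carry shifts are
+    needed. */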
OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + 
InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack17(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(131071U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 11); + 
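/*
 * ipack17 is the masked twin of ipackwithoutmask17: each delta is ANDed
 * with mask = 131071U, i.e. (1U << 17) - 1, before being merged, so an
 * out-of-range delta is truncated instead of corrupting the neighbouring
 * fields. Either way the routine consumes 32 input vectors (128 integers)
 * and emits exactly 17 packed output vectors.
 */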
++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 17 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + 
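/*
 * Even bit widths shorten the carry chain: with b = 18, lcm(18, 32) =
 * 288 bits, so the layout repeats after 16 values packed into 9 output
 * words. The function therefore resets cleanly at the halfway point
 * (the second "OutReg = InReg;" below) rather than chaining across all
 * 32 steps as the odd widths must.
 */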
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack18(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(262143U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
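/*
 * For reference, a rough scalar equivalent of this masked, integrated
 * delta packing. This is an illustrative sketch only, kept in a comment
 * so it does not touch the generated kernel; the identifiers are
 * hypothetical and not part of this library:
 *
 *   void scalar_delta_pack(const uint32_t *in, int n, int b,
 *                          uint32_t prev, uint32_t *out) {
 *       uint32_t acc = 0;          // output word currently being filled
 *       int used = 0;              // bits already occupied in acc
 *       for (int i = 0; i < n; i++) {
 *           uint32_t m = (b < 32) ? (1u << b) - 1 : ~0u;
 *           uint32_t d = (in[i] - prev) & m;   // masked delta
 *           prev = in[i];
 *           acc |= d << used;                  // low bits, current word
 *           if (used + b >= 32) {              // word full: flush it
 *               *out++ = acc;
 *               acc = (used + b > 32) ? d >> (32 - used) : 0;
 *           }
 *           used = (used + b) & 31;
 *       }
 *       if (used) *out++ = acc;                // flush the partial word
 *   }
 *
 * The SIMD code follows the same recipe but applies it lane-wise, so the
 * packed output forms four interleaved bit streams, while Delta() still
 * produces true consecutive differences across lane boundaries.
 */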
+ OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 18 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack19(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(524287U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 19 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 14); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 19 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + 
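/*
 * b = 20 behaves like b = 18 with an even shorter cycle: lcm(20, 32) =
 * 160 bits, i.e. 8 values per 5 output words, so "OutReg = InReg;"
 * appears four times in this function, once per 8-value group. The
 * masked twin ipack20 below uses mask = 1048575U = (1U << 20) - 1.
 */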
CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack20(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1048575U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 20 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack21(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2097151U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 21 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack22(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(4194303U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); 
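+ /* Note (editorial): when a 22-bit field straddles a 32-bit lane boundary, the + _mm_srli_epi32 above carries its high bits into the low bits of the next + output word before the following fields are OR-ed in. */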
+ ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + 
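+ /* Note (editorial): Delta() (defined elsewhere in this code) appears to subtract + the previous 4-lane vector from the current one, fusing differencing with the + packing pass; the AND with mask keeps only the low 22 bits of each delta. */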
InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 22 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; 
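+ /* Note (editorial): the ipackwithoutmaskNN variants omit the _mm_and_si128 step, + presumably because the caller guarantees every delta already fits in NN bits, + saving one instruction per vector. */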
+ CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + 
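+ /* Note (editorial): unaligned loads and stores (_mm_loadu_si128/_mm_storeu_si128) + are used throughout, so neither _in nor out needs 16-byte alignment. */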
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack23(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(8388607U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 
14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 23 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + 
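+ /* Note (editorial): for bit width b, 128 input integers (32 SSE vectors) pack + into exactly b output __m128i words; ipack23 therefore issues 23 stores. */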
_mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, 
initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack24(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(16777215U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + 
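+ /* Note (editorial): four 24-bit fields fill exactly three 32-bit words, so this + function repeats the same 4-load/3-store group eight times with no carry + across group boundaries. */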
OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 24 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + 
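+ /* Note (editorial): end of the eighth and final 4-input/3-output group of ipack24. */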
+} + + + + +void ipackwithoutmask25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack25(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(33554431U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset 
= CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
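+ /* A 25-bit field that straddles a 32-bit word boundary is finished in the
+ next output word: OutReg = _mm_srli_epi32(InReg, 25 - k) seeds the word with
+ the carried high bits before new fields are OR-ed in at shift k. */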
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 25 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = 
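+ /* ipackwithoutmask26: 26-bit variant of the same delta+pack loop, minus the
+ AND mask; the caller must guarantee every lane-wise delta fits in 26 bits. */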
_mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, 
OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack26(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(67108863U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + 
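+ /* ipack26: masked 26-bit packing (mask 67108863U = 2^26-1); 128 delta-coded
+ values fill 26 output vectors. */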
CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
_mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 26 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
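+ /* ipackwithoutmask27: unmasked 27-bit delta packing. 27 and 32 share no
+ common factor, so most fields straddle a word boundary and most output words
+ begin with carried-over high bits. */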
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack27(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(134217727U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = 
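+ /* ipack27: masked 27-bit packing (mask 134217727U = 2^27-1). */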
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 
27 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + 
OutReg = _mm_srli_epi32(InReg, 27 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 27 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
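+ /* ipackwithoutmask28: since 8 x 28 bits = 7 x 32 bits, each group of 8
+ vectors packs into 7 output words with no bits carried across the seam
+ (OutReg = InReg restarts the pattern); the group repeats four times for the
+ 32 input vectors. */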
20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + 
initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack28(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(268435455U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = 
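+ /* ipack28: masked 28-bit packing (mask 268435455U = 2^28-1); same
+ 4 x (8 values -> 7 words) layout as the unmasked version. */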
CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 28 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, 
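+ /* ipackwithoutmask29: unmasked 29-bit delta packing. A delta wider than 29
+ bits would corrupt neighbouring fields here, so this variant is only safe
+ once the block's maximum bit width has been checked. */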
_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = 
Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack29(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(536870911U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
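+ /* ipack29: masked 29-bit packing (mask 536870911U = 2^29-1); 128 inputs
+ produce 29 output vectors. */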
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 16); + ++in; + 
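+ /* Masked variant: 536870911U == (1U << 29) - 1, so each delta is ANDed down
+ to its low 29 bits before packing; anything wider is silently truncated.
+ The ipackwithoutmask29 twin above omits the AND and is only safe once the
+ caller has verified that every delta fits in 29 bits. */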
CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = 
_mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 29 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 
12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack30(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(1073741823U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = 
_mm_srli_epi32(InReg, 30 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 30 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask31(__m128i initOffset, const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + 
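+ /* ipackwithoutmask31 packs 32 delta vectors at 31 bits apiece without
+ masking; the caller must guarantee each delta fits in 31 bits, whereas the
+ masked ipack31 further below truncates instead. */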
OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 15)); + 
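+ /* 32 fields x 31 bits = 992 bits per lane, i.e. 31 of 32 output words: at
+ this width a block saves only a single 128-bit word, and every field after
+ the first straddles a word boundary, hence the srli carry that seeds each
+ new output word. */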
_mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = Delta(CurrIn, initOffset); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack31(__m128i initOffset, const 
uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + const __m128i mask = _mm_set1_epi32(2147483647U); ; + + __m128i CurrIn = _mm_loadu_si128(in); + __m128i InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + OutReg = InReg; + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 31)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 30); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 30)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 29); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 29)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 28); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 28)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 27); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 27)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 26); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 26)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 25); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 25)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 24); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 24)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 23); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 23)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 22); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 22)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 21); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 21)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 20); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 20)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 19); + ++in; + CurrIn = 
_mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 19)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 18); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 18)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 17); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 17)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 16); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 16)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 15); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 15)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 14); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 14)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 13); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 13)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 12); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 12)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 11); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 11)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 10); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 10)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 9); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 9)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 8); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 8)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 7); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 7)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 6); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, 
initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 6)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 5); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 5)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 4); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 4)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 3); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 3)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 2); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 2)); + _mm_storeu_si128(out, OutReg); + + ++out; + OutReg = _mm_srli_epi32(InReg, 31 - 1); + ++in; + CurrIn = _mm_loadu_si128(in); + InReg = _mm_and_si128(Delta(CurrIn, initOffset), mask); + initOffset = CurrIn; + + OutReg = _mm_or_si128(OutReg,_mm_slli_epi32(InReg, 1)); + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipackwithoutmask32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + 
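+ /* b == 32 degenerates to a verbatim copy: no Delta, no shifts, no carries.
+ All 32 input vectors (512 bytes) are stored unchanged, so initOffset goes
+ unused in both 32-bit kernels. */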
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + +void ipack32(__m128i initOffset , const uint32_t * _in, __m128i * out) { + const __m128i *in = (const __m128i*)(_in); + __m128i OutReg; + + + + __m128i InReg = _mm_loadu_si128(in); + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + 
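+ /* ipack32 is byte-for-byte the same as ipackwithoutmask32, since ANDing with
+ an all-ones 32-bit mask is a no-op; the redundant parameter keeps every
+ kernel on one signature. A hypothetical caller-side dispatch (not part of
+ this file) could then select a kernel by bit width:
+     typedef void (*ipackfn)(__m128i, const uint32_t *, __m128i *);
+     ipackfn table[33];            /* e.g. table[29] = ipack29, ... */
+     table[b](seed, in, out);      /* pack one 128-integer block at width b */
+ */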
_mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + ++out; + ++in; + InReg = _mm_loadu_si128(in); + + OutReg = InReg; + _mm_storeu_si128(out, OutReg); + + +} + + + + + +__m128i iunpack1(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<1)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
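+ /* The unpack side mirrors the packer: shift out the next 1-bit field, mask
+ it, then PrefixSum integrates it against the running initOffset, which is
+ finally returned as the seed for the next block. Assuming the d1 semantics
+ the Delta/PrefixSum names suggest, the value stream obeys (a sketch):
+     prev = seed;
+     for (i = 0; i < 128; i++) { out[i] = prev + d[i]; prev = out[i]; }
+ where d[i] abstracts over the interleaved four-lane field order. */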
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack2(__m128i initOffset, 
const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<2)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack3(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<3)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
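+ /* Same extract/integrate cadence at b == 3, but the field offsets advance
+ 0,3,...,30 within each 32-bit word, so a fresh input word is needed every
+ ten to eleven values instead of at a fixed power-of-two stride. */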
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + 
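+ /* Field extraction: shift the packed word right to the field's bit offset, then mask to the low b bits ((1U<<3)-1 here). */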
OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack4(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<4)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + 
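+ /* b=4 divides 32 evenly, so every field sits inside a single word: this function reloads input words but never needs cross-word bit splicing. */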
OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack5(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<5)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + 
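+ /* b=5 does not divide 32, so some fields below straddle a word boundary and are reassembled from two consecutive input words. */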
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack6(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<6)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
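+ /* Cross-word splice: when a field's low bits sit at the top of the current word, _mm_slli_epi32(InReg, 6-k) lifts the k bits taken from the next word above the bits already held, before masking to 6 bits. */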
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack7(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<7)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack8(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<8)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); 
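+ /* b=8: four byte-aligned fields per 32-bit word; a fresh word is loaded after every fourth field and no splicing is ever required. */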
+ initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack9(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<9)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset 
= OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack10(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<10)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg 
= _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack11(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<11)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,19); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack12(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<12)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
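+ /* b=12: the layout repeats every 8 fields (3 words); fields at bit offsets 24 and 28 splice 4 and 8 bits from the next word (the 12-4 and 12-8 shifts), while the field at offset 20 ends exactly at bit 32 and needs only a reload. */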
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack13(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<13)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = 
OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + 
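+ /* the straddling field above spilled 12 of its 13 bits into this word (the slli by
+    13-12 placed new-word bits 0..11), so the next field starts cleanly at bit 12 */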
OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack14(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<14)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
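+ /* word-straddling case: the low 6 bits of this value came from the old word (>>26) and
+    the high 8 bits from the freshly loaded word, shifted into place by 14-8 = 6 */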
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
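+ /* initOffset always tracks the last four absolute values, so each group's local
+    prefix sum chains into one prefix sum over the whole 128-integer block */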
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack15(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<15)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = 
_mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack16(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<16)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); 
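+ /* b = 16 divides the 32-bit word evenly: every word holds exactly two fields, no value
+    straddles a boundary, and the body degenerates into alternating mask-low / shift-high
+    steps, one load per two output vectors. A hedged scalar sketch of what each iunpackN
+    computes overall (simplified: it ignores the 4-lane interleaving of the SIMD layout
+    and the word-straddling logic; in32/out/offset/b are illustrative names only):
+
+      uint32_t prev = offset;                                        // carried in via initOffset
+      for (int k = 0; k < 128; k++) {                                // 128 integers per block
+          uint32_t v = (in32[(k*b)/32] >> ((k*b)%32)) & ((1u<<b)-1); // b < 32
+          prev += v;                                                 // undo the delta coding
+          out[k] = prev;
+      }
+ */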
+ + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack17(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<17)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset 
= OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack18(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<18)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + 
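+ /* first group of a fresh block: the caller-supplied initOffset seeds the scan, and the
+    vector returned at the bottom presumably lets consecutive 128-integer blocks chain
+    their prefix sums (inferred from the signature, not stated in this file) */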
initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack19(__m128i initOffset, 
const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<19)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
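+ /* for odd widths such as 19, gcd(19,32) = 1, so the field offsets cycle through every
+    bit position before repeating; unlike b = 12 or b = 16, this unrolled body has no
+    shorter repeating sub-pattern */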
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack20(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<20)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); + + OutReg = 
PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack21(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<21)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, 
OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); + + OutReg = PrefixSum(OutReg, 
initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack22(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<22)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; 
InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack23(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<23)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg 
= tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), 
mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack24(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<24)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack25(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<25)-1); + + + + 
tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack26(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<26)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + 
OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack27(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<27)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg 
= _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + 
_mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack28(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<28)-1); + + + + 
tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + 
OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
28-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack29(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<29)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = 
_mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = 
_mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack30(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<30)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; 
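/* 30-bit layout: 16 deltas fill exactly 15 input words per lane, so the
   shift sequence 30,28,...,2 repeats twice over this 32-value stretch */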
+ _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 
30-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, 
_mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + + +__m128i iunpack31(__m128i initOffset, const __m128i* in, uint32_t * _out) { + + __m128i* out = (__m128i*)(_out); + __m128i InReg = _mm_loadu_si128(in); + __m128i OutReg; + __m128i tmp; + __m128i mask = _mm_set1_epi32((1U<<31)-1); + + + + tmp = InReg; + OutReg = _mm_and_si128(tmp, mask); + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,31); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,30); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,29); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,28); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,27); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,26); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,25); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,24); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,23); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,22); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,21); + OutReg = tmp; + ++in; InReg = 
_mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,20); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,19); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,18); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,17); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,16); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,15); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,14); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,13); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,12); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,11); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,10); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,9); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,8); + OutReg = tmp; 
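/* 31-bit case: nearly every delta straddles a word boundary, so almost
   every step loads the next input vector and merges carried high bits */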
+ ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,7); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,6); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,5); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,4); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,3); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,2); + OutReg = tmp; + ++in; InReg = _mm_loadu_si128(in); + OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); + + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + tmp = _mm_srli_epi32(InReg,1); + OutReg = tmp; + OutReg = PrefixSum(OutReg, initOffset); + initOffset = OutReg; + _mm_storeu_si128(out++, OutReg); + + + return initOffset; + +} + + + + +__m128i iunpack32(__m128i initOffset, const __m128i* in, uint32_t * _out) { + __m128i * mout = (__m128i *)(_out); + __m128i invec; + size_t k; + for(k = 0; k < 128/4; ++k) { + invec = _mm_loadu_si128(in++); + _mm_storeu_si128(mout++, invec); + } + return invec; +} + + + + + void simdunpackd1(uint32_t initvalue, const __m128i * in, uint32_t * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: iunpack0(initOffset,in,out); break; + + case 1: iunpack1(initOffset,in,out); break; + + case 2: iunpack2(initOffset,in,out); break; + + case 3: iunpack3(initOffset,in,out); break; + + case 4: iunpack4(initOffset,in,out); break; + + case 5: iunpack5(initOffset,in,out); break; + + case 6: iunpack6(initOffset,in,out); break; + + case 7: iunpack7(initOffset,in,out); break; + + case 8: iunpack8(initOffset,in,out); break; + + case 9: iunpack9(initOffset,in,out); break; + + case 10: iunpack10(initOffset,in,out); break; + + case 11: iunpack11(initOffset,in,out); break; + + case 12: iunpack12(initOffset,in,out); break; + + case 13: iunpack13(initOffset,in,out); break; + + case 14: iunpack14(initOffset,in,out); break; + + case 15: iunpack15(initOffset,in,out); break; + + case 16: iunpack16(initOffset,in,out); break; + + case 17: iunpack17(initOffset,in,out); break; + + case 18: iunpack18(initOffset,in,out); break; + + case 19: iunpack19(initOffset,in,out); break; + + case 20: iunpack20(initOffset,in,out); break; + + case 21: 
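/* each iunpackN call consumes exactly N 128-bit input vectors and emits
   128 prefix-summed uint32_t values, returning the running offset vector */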
iunpack21(initOffset,in,out); break; + + case 22: iunpack22(initOffset,in,out); break; + + case 23: iunpack23(initOffset,in,out); break; + + case 24: iunpack24(initOffset,in,out); break; + + case 25: iunpack25(initOffset,in,out); break; + + case 26: iunpack26(initOffset,in,out); break; + + case 27: iunpack27(initOffset,in,out); break; + + case 28: iunpack28(initOffset,in,out); break; + + case 29: iunpack29(initOffset,in,out); break; + + case 30: iunpack30(initOffset,in,out); break; + + case 31: iunpack31(initOffset,in,out); break; + + case 32: iunpack32(initOffset,in,out); break; + + default: break; + } +} + + + + /*assumes that integers fit in the prescribed number of bits*/ + +void simdpackwithoutmaskd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: break; + + case 1: ipackwithoutmask1(initOffset,in,out); break; + + case 2: ipackwithoutmask2(initOffset,in,out); break; + + case 3: ipackwithoutmask3(initOffset,in,out); break; + + case 4: ipackwithoutmask4(initOffset,in,out); break; + + case 5: ipackwithoutmask5(initOffset,in,out); break; + + case 6: ipackwithoutmask6(initOffset,in,out); break; + + case 7: ipackwithoutmask7(initOffset,in,out); break; + + case 8: ipackwithoutmask8(initOffset,in,out); break; + + case 9: ipackwithoutmask9(initOffset,in,out); break; + + case 10: ipackwithoutmask10(initOffset,in,out); break; + + case 11: ipackwithoutmask11(initOffset,in,out); break; + + case 12: ipackwithoutmask12(initOffset,in,out); break; + + case 13: ipackwithoutmask13(initOffset,in,out); break; + + case 14: ipackwithoutmask14(initOffset,in,out); break; + + case 15: ipackwithoutmask15(initOffset,in,out); break; + + case 16: ipackwithoutmask16(initOffset,in,out); break; + + case 17: ipackwithoutmask17(initOffset,in,out); break; + + case 18: ipackwithoutmask18(initOffset,in,out); break; + + case 19: ipackwithoutmask19(initOffset,in,out); break; + + case 20: ipackwithoutmask20(initOffset,in,out); break; + + case 21: ipackwithoutmask21(initOffset,in,out); break; + + case 22: ipackwithoutmask22(initOffset,in,out); break; + + case 23: ipackwithoutmask23(initOffset,in,out); break; + + case 24: ipackwithoutmask24(initOffset,in,out); break; + + case 25: ipackwithoutmask25(initOffset,in,out); break; + + case 26: ipackwithoutmask26(initOffset,in,out); break; + + case 27: ipackwithoutmask27(initOffset,in,out); break; + + case 28: ipackwithoutmask28(initOffset,in,out); break; + + case 29: ipackwithoutmask29(initOffset,in,out); break; + + case 30: ipackwithoutmask30(initOffset,in,out); break; + + case 31: ipackwithoutmask31(initOffset,in,out); break; + + case 32: ipackwithoutmask32(initOffset,in,out); break; + + default: break; + } +} + + + + +void simdpackd1(uint32_t initvalue, const uint32_t * in, __m128i * out, const uint32_t bit) { + __m128i initOffset = _mm_set1_epi32 (initvalue); + switch(bit) { + case 0: break;; + + case 1: ipack1(initOffset, in,out); break; + + case 2: ipack2(initOffset, in,out); break; + + case 3: ipack3(initOffset, in,out); break; + + case 4: ipack4(initOffset, in,out); break; + + case 5: ipack5(initOffset, in,out); break; + + case 6: ipack6(initOffset, in,out); break; + + case 7: ipack7(initOffset, in,out); break; + + case 8: ipack8(initOffset, in,out); break; + + case 9: ipack9(initOffset, in,out); break; + + case 10: ipack10(initOffset, in,out); break; + + case 11: ipack11(initOffset, in,out); break; + + case 12: ipack12(initOffset, in,out); break; + + case 13: 
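/* Illustrative d1 round-trip for one 128-integer sorted block (hypothetical
   caller; simdmaxbitsd1/simdpackd1/simdunpackd1 as exercised in unit.c above):
     uint32_t b = simdmaxbitsd1(offset, in);       // bit width of the deltas
     simdpackd1(offset, in, (__m128i *)buf, b);    // pack, masking to b bits
     simdunpackd1(offset, (__m128i *)buf, out, b); // decode back to absolutes
   where offset is the last value of the previous block (0 for the first). */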
ipack13(initOffset, in,out); break; + + case 14: ipack14(initOffset, in,out); break; + + case 15: ipack15(initOffset, in,out); break; + + case 16: ipack16(initOffset, in,out); break; + + case 17: ipack17(initOffset, in,out); break; + + case 18: ipack18(initOffset, in,out); break; + + case 19: ipack19(initOffset, in,out); break; + + case 20: ipack20(initOffset, in,out); break; + + case 21: ipack21(initOffset, in,out); break; + + case 22: ipack22(initOffset, in,out); break; + + case 23: ipack23(initOffset, in,out); break; + + case 24: ipack24(initOffset, in,out); break; + + case 25: ipack25(initOffset, in,out); break; + + case 26: ipack26(initOffset, in,out); break; + + case 27: ipack27(initOffset, in,out); break; + + case 28: ipack28(initOffset, in,out); break; + + case 29: ipack29(initOffset, in,out); break; + + case 30: ipack30(initOffset, in,out); break; + + case 31: ipack31(initOffset, in,out); break; + + case 32: ipack32(initOffset, in,out); break; + + default: break; + } +} + diff --git a/aux/simdcomp/src/simdintegratedbitpacking.o b/aux/simdcomp/src/simdintegratedbitpacking.o new file mode 100644 index 0000000..4c33433 Binary files /dev/null and b/aux/simdcomp/src/simdintegratedbitpacking.o differ diff --git a/aux/simdcomp/src/unit.c b/aux/simdcomp/src/unit.c new file mode 100644 index 0000000..826f447 --- /dev/null +++ b/aux/simdcomp/src/unit.c @@ -0,0 +1,63 @@ +/** + * This code is released under a BSD License. + */ +#include +#include +#include "simdcomp.h" + + +int main() { + int N = 5000 * SIMDBlockSize; + __m128i * buffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + uint32_t * datain = malloc(N * sizeof(uint32_t)); + uint32_t * backbuffer = malloc(SIMDBlockSize * sizeof(uint32_t)); + for (int gap = 1; gap <= 387420489; gap *= 3) { + printf(" gap = %u \n", gap); + for (int k = 0; k < N; ++k) + datain[k] = k * gap; + uint32_t offset = 0; + for (int k = 0; k * SIMDBlockSize < N; ++k) { + ///////////////////////////// + // First part works for general arrays (sorted or unsorted) + ///////////////////////////// + // we compute the bit width + const uint32_t b = maxbits(datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b 128-bit vectors at "buffer" + simdpackwithoutmask(datain + k * SIMDBlockSize, buffer, b); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpack(buffer, backbuffer, b);//uncompressed + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack\n"); + return -2; + } + } + ///////////////////////////// + // next part assumes that the data is sorted (uses differential coding) + ///////////////////////////// + // we compute the bit width + const uint32_t b1 = simdmaxbitsd1(offset, + datain + k * SIMDBlockSize); + // we read 128 integers at "datain + k * SIMDBlockSize" and + // write b1 128-bit vectors at "buffer" + simdpackwithoutmaskd1(offset, datain + k * SIMDBlockSize, buffer, + b1); + // we read back b1 128-bit vectors at "buffer" and write 128 integers at backbuffer + simdunpackd1(offset, buffer, backbuffer, b1); + for (int j = 0; j < SIMDBlockSize; ++j) { + if (backbuffer[j] != datain[k * SIMDBlockSize + j]) { + printf("bug in simdpack d1\n"); + return -3; + } + } + offset = datain[k * SIMDBlockSize + SIMDBlockSize - 1]; + + } + } + free(buffer); + free(datain); + free(backbuffer); + printf("Code looks good.\n"); + return 0; +} diff --git a/aux/simple8b.c b/aux/simple8b.c new file mode 100644 index 
0000000..0bec2b7 --- /dev/null +++ b/aux/simple8b.c @@ -0,0 +1,333 @@ +// modified and speed optimized 64 bits version from: +// Vo Ngoc Anh, Alistair Moffat: Index compression using 64-bit words. +// Softw., Pract. Exper. 40(2): 131-147 (2010) +// http://ww2.cs.mu.oz.au/~alistair/coders-64bit/ + + #if defined(__x86_64__) || defined(__x86_32__) +static inline int bsr32(int x) { + int b = -1; + asm("bsrl %1,%0" : "+r" (b): "rm" (x) ); + return b + 1; +} + #else +static inline int bsr32(int x) { + return x?32 - __builtin_clz(x):0; +} + #endif + +#define WPUT(__x,__bit) { __bw |= (unsigned long long)(__x)<<__br; __br += __bit; } +#define WPUTZERO(__sel) { __bw = __br = 0; WPUT(__sel,4); } +#define WPUTFLUSH(__out) { *(typeof(__bw) *)__out = __bw; __out += sizeof(__bw)/sizeof(__out[0]); } + +#if 0 //WORD_SIZE==32 + #define CODE_TABLE \ + unsigned char sel2bit[]= { 0, 0, 0, 0, 0, 0, 0, 1 ,2,3,4,5,7,9,14,28}; \ + unsigned sel2elems[]= {256,120,90,60,50,40,32,28,14,9,7,5,4,3, 2, 1}; \ + + #define BIT_2_SEL \ + char bit2sel[]= { 0,7,8,9,10,11,12,12,13,13,14,14,14,14,14, \ + 15,15,15,15,15,15,15,15,15,15,15,15,15,15, \ + -1,-1,-1,-1}; + #define MAX_BIT 28 +#else +#define CODE_TABLE \ + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ \ +unsigned char sel2bit[]= { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20, 30, 60,61}; \ +unsigned sel2elems[]= {256,120,60,30,20,15,12,10, 8, 7, 6, 5, 4, 3, 2, 1}; \ +unsigned sellim[]= {256,120,60,60,60,60,60,60,56,56, 60, 60, 60, 60, 60, 60}; + +#define BIT_2_SEL char bit2sel[]= \ + {0,2,3,4,5,6,7,8, 9,10,10,11,11,12,12,12, \ + 13,13,13,13,13,14,14,14, 14,14,14,14,14,14,14,15, \ + 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, \ + 15,15,15,15,15,15,15,15, 15,15,15,15,15,-1, -1, -1, -1}; + + #define MAX_BIT 60 +#endif + +CODE_TABLE +BIT_2_SEL + +unsigned char *s8benco(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) { + unsigned long long __bw; unsigned __br = 0; + unsigned char bits[0x100]; + int elems; + int i,j; + for (i = 0; i < n; i++) { + unsigned xb = in[i]; + bits[i] = bsr32(xb)+1; + } //CalcMinBits(in, bits, n); + int sel, bit,tmp; /*BLK_ENC_ADJUST*/ + for (i=0; i bit) { + tmp = bit2sel[bits[j]] ; + if(elems < sel2elems[ tmp ]) { + sel = tmp; + bit= sel2bit[sel]; + } else { + while ( elems < sel2elems[sel] ) sel++; + elems = sel2elems[sel]; + bit = sel2bit[sel]; + break; + } + } + elems++; + } + if (bit == 0) { /* can be downgrade to bit=1 */ + if (i+elems elems; sel++); + elems = sel2elems[sel]; + bit = sel2bit[sel]; + } else sel = 0; /* what a waste! 
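(a short zero-bit tail still flushes a full 64-bit selector word)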
*/ + } else { + sel = bit2sel[bit]; + bit = sel2bit[sel]; + } + WPUTZERO(sel); + if (bit) { + for ( ; elems ; elems--, i++) WPUT(in[i],bit); + } else + i += elems; + WPUTFLUSH(out); + } + return out; +} + +#define MSK(__x) ((1ul<<__x)-1) +unsigned char *s8bdeco(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { + unsigned char *ip = in; + unsigned i,*_out = out,*out_ = out+n; + while(out < out_) { + unsigned long long w = *(unsigned long long *)ip; + switch(w & 15) { + #if 0 + case 0: ip+=8; for(i=0; i<256; i++) out[i]= 1; out += 256; break; + #else + case 0: { int r = (w>>4)&0xf; ip++; if(r == 0xf) { r = (w>>8)&0xff; ip++; } while(r-->=0) *out++=0; } break; + #endif + + case 1: ip+=8; + for(i=0; i<120; i++) out[i]= 1; out += 120; + break; + case 2: ip+=8; + out[ 0]= (w >> 4) & MSK(1); + out[ 1]= (w >> 5) & MSK(1); + out[ 2]= (w >> 6) & MSK(1); + out[ 3]= (w >> 7) & MSK(1); + out[ 4]= (w >> 8) & MSK(1); + out[ 5]= (w >> 9) & MSK(1); + out[ 6]= (w >> 10) & MSK(1); + out[ 7]= (w >> 11) & MSK(1); + out[ 8]= (w >> 12) & MSK(1); + out[ 9]= (w >> 13) & MSK(1); + out[10]= (w >> 14) & MSK(1); + out[11]= (w >> 15) & MSK(1); + out[12]= (w >> 16) & MSK(1); + out[13]= (w >> 17) & MSK(1); + out[14]= (w >> 18) & MSK(1); + out[15]= (w >> 19) & MSK(1); + out[16]= (w >> 20) & MSK(1); + out[17]= (w >> 21) & MSK(1); + out[18]= (w >> 22) & MSK(1); + out[19]= (w >> 23) & MSK(1); + out[20]= (w >> 24) & MSK(1); + out[21]= (w >> 25) & MSK(1); + out[22]= (w >> 26) & MSK(1); + out[23]= (w >> 27) & MSK(1); + out[24]= (w >> 28) & MSK(1); + out[25]= (w >> 29) & MSK(1); + out[26]= (w >> 30) & MSK(1); + out[27]= (w >> 31) & MSK(1); + out[28]= (w >> 32) & MSK(1); + out[29]= (w >> 33) & MSK(1); + out[30]= (w >> 34) & MSK(1); + out[31]= (w >> 35) & MSK(1); + out[32]= (w >> 36) & MSK(1); + out[33]= (w >> 37) & MSK(1); + out[34]= (w >> 38) & MSK(1); + out[35]= (w >> 39) & MSK(1); + out[36]= (w >> 40) & MSK(1); + out[37]= (w >> 41) & MSK(1); + out[38]= (w >> 42) & MSK(1); + out[39]= (w >> 43) & MSK(1); + out[40]= (w >> 44) & MSK(1); + out[41]= (w >> 45) & MSK(1); + out[42]= (w >> 46) & MSK(1); + out[43]= (w >> 47) & MSK(1); + out[44]= (w >> 48) & MSK(1); + out[45]= (w >> 49) & MSK(1); + out[46]= (w >> 50) & MSK(1); + out[47]= (w >> 51) & MSK(1); + out[48]= (w >> 52) & MSK(1); + out[49]= (w >> 53) & MSK(1); + out[50]= (w >> 54) & MSK(1); + out[51]= (w >> 55) & MSK(1); + out[52]= (w >> 56) & MSK(1); + out[53]= (w >> 57) & MSK(1); + out[54]= (w >> 58) & MSK(1); + out[55]= (w >> 59) & MSK(1); + out[56]= (w >> 60) & MSK(1); + out[57]= (w >> 61) & MSK(1); + out[58]= (w >> 62) & MSK(1); + out[59]= (w >> 63) & MSK(1); out += 60; + break; + case 3: ip+=8; + out[ 0]= (w >> 4) & MSK(2); + out[ 1]= (w >> 6) & MSK(2); + out[ 2]= (w >> 8) & MSK(2); + out[ 3]= (w >> 10) & MSK(2); + out[ 4]= (w >> 12) & MSK(2); + out[ 5]= (w >> 14) & MSK(2); + out[ 6]= (w >> 16) & MSK(2); + out[ 7]= (w >> 18) & MSK(2); + out[ 8]= (w >> 20) & MSK(2); + out[ 9]= (w >> 22) & MSK(2); + out[10]= (w >> 24) & MSK(2); + out[11]= (w >> 26) & MSK(2); + out[12]= (w >> 28) & MSK(2); + out[13]= (w >> 30) & MSK(2); + out[14]= (w >> 32) & MSK(2); + out[15]= (w >> 34) & MSK(2); + out[16]= (w >> 36) & MSK(2); + out[17]= (w >> 38) & MSK(2); + out[18]= (w >> 40) & MSK(2); + out[19]= (w >> 42) & MSK(2); + out[20]= (w >> 44) & MSK(2); + out[21]= (w >> 46) & MSK(2); + out[22]= (w >> 48) & MSK(2); + out[23]= (w >> 50) & MSK(2); + out[24]= (w >> 52) & MSK(2); + out[25]= (w >> 54) & MSK(2); + out[26]= (w >> 56) & MSK(2); + out[27]= (w >> 58) & MSK(2); + out[28]= (w 
>> 60) & MSK(2); + out[29]= (w >> 62) & MSK(2); out += 30; + break; + case 4: ip+=8; + out[ 0]= (w >> 4) & MSK(3); + out[ 1]= (w >> 7) & MSK(3); + out[ 2]= (w >> 10) & MSK(3); + out[ 3]= (w >> 13) & MSK(3); + out[ 4]= (w >> 16) & MSK(3); + out[ 5]= (w >> 19) & MSK(3); + out[ 6]= (w >> 22) & MSK(3); + out[ 7]= (w >> 25) & MSK(3); + out[ 8]= (w >> 28) & MSK(3); + out[ 9]= (w >> 31) & MSK(3); + out[10]= (w >> 34) & MSK(3); + out[11]= (w >> 37) & MSK(3); + out[12]= (w >> 40) & MSK(3); + out[13]= (w >> 43) & MSK(3); + out[14]= (w >> 46) & MSK(3); + out[15]= (w >> 49) & MSK(3); + out[16]= (w >> 52) & MSK(3); + out[17]= (w >> 55) & MSK(3); + out[18]= (w >> 58) & MSK(3); + out[19]= (w >> 61) & MSK(3); out += 20; + break; + case 5: ip+=8; + out[ 0]= (w >> 4) & MSK(4); + out[ 1]= (w >> 8) & MSK(4); + out[ 2]= (w >> 12) & MSK(4); + out[ 3]= (w >> 16) & MSK(4); + out[ 4]= (w >> 20) & MSK(4); + out[ 5]= (w >> 24) & MSK(4); + out[ 6]= (w >> 28) & MSK(4); + out[ 7]= (w >> 32) & MSK(4); + out[ 8]= (w >> 36) & MSK(4); + out[ 9]= (w >> 40) & MSK(4); + out[10]= (w >> 44) & MSK(4); + out[11]= (w >> 48) & MSK(4); + out[12]= (w >> 52) & MSK(4); + out[13]= (w >> 56) & MSK(4); + out[14]= (w >> 60) & MSK(4); out += 15; + break; + case 6: ip+=8; + out[ 0]= (w >> 4) & MSK(5); + out[ 1]= (w >> 9) & MSK(5); + out[ 2]= (w >> 14) & MSK(5); + out[ 3]= (w >> 19) & MSK(5); + out[ 4]= (w >> 24) & MSK(5); + out[ 5]= (w >> 29) & MSK(5); + out[ 6]= (w >> 34) & MSK(5); + out[ 7]= (w >> 39) & MSK(5); + out[ 8]= (w >> 44) & MSK(5); + out[ 9]= (w >> 49) & MSK(5); + out[10]= (w >> 54) & MSK(5); + out[11]= (w >> 59) & MSK(5); out += 12; + break; + case 7: ip+=8; + out[0]= (w >> 4) & MSK(6); + out[1]= (w >> 10) & MSK(6); + out[2]= (w >> 16) & MSK(6); + out[3]= (w >> 22) & MSK(6); + out[4]= (w >> 28) & MSK(6); + out[5]= (w >> 34) & MSK(6); + out[6]= (w >> 40) & MSK(6); + out[7]= (w >> 46) & MSK(6); + out[8]= (w >> 52) & MSK(6); + out[9]= (w >> 58) & MSK(6); out += 10; + break; + case 8: ip+=8; + out[0]= (w >> 4 ) & MSK(7); + out[1]= (w >> 11) & MSK(7); + out[2]= (w >> 18) & MSK(7); + out[3]= (w >> 25) & MSK(7); + out[4]= (w >> 32) & MSK(7); + out[5]= (w >> 39) & MSK(7); + out[6]= (w >> 46) & MSK(7); + out[7]= (w >> 53) & MSK(7); out += 8; + break; + case 9: ip+=8; + out[0]= (w >> 4 ) & MSK(8); + out[1]= (w >> 12) & MSK(8); + out[2]= (w >> 20) & MSK(8); + out[3]= (w >> 28) & MSK(8); + out[4]= (w >> 36) & MSK(8); + out[5]= (w >> 44) & MSK(8); + out[6]= (w >> 52) & MSK(8); out += 7; + break; + case 10: ip+=8; + out[0]= (w >> 4) & MSK(10); + out[1]= (w >> 14) & MSK(10); + out[2]= (w >> 24) & MSK(10); + out[3]= (w >> 34) & MSK(10); + out[4]= (w >> 44) & MSK(10); + out[5]= (w >> 54) & MSK(10); out += 6; + break; + case 11: ip+=8; + out[0]= (w >> 4) & MSK(12); + out[1]= (w >> 16) & MSK(12); + out[2]= (w >> 28) & MSK(12); + out[3]= (w >> 40) & MSK(12); + out[4]= (w >> 52) & MSK(12); out += 5; + break; + case 12: ip+=8; + out[0]= (w >> 4) & MSK(15); + out[1]= (w >> 19) & MSK(15); + out[2]= (w >> 34) & MSK(15); + out[3]= (w >> 49) & MSK(15); out += 4; + break; + case 13: ip+=8; + out[0]= (w >> 4) & MSK(20); + out[1]= (w >> 24) & MSK(20); + out[2]= (w >> 44) & MSK(20); out += 3; + break; + case 14: ip+=8; + out[0]= (w >> 4) & MSK(30); + out[1]= (w >> 34) & MSK(30); out += 2; + break; + case 15: ip+=8; + out[0]= (w >> 4) & ((1ull<<60)-1); out += 1; + break; + } + } + return ip; +} + diff --git a/aux/simple8b.h b/aux/simple8b.h new file mode 100644 index 0000000..8772124 --- /dev/null +++ b/aux/simple8b.h @@ -0,0 +1,2 @@ +unsigned char 
*s8benco(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out); +unsigned char *s8bdeco(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); diff --git a/aux/simple8b.o b/aux/simple8b.o new file mode 100644 index 0000000..7de646c Binary files /dev/null and b/aux/simple8b.o differ diff --git a/aux/vabyte.h b/aux/vabyte.h new file mode 100644 index 0000000..627318b --- /dev/null +++ b/aux/vabyte.h @@ -0,0 +1,99 @@ +// "variablebyte.h" C Version from https://github.com/lemire/FastPFor +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + * + * (c) Daniel Lemire, http://lemire.me/en/ + */ + +#define extract7bits(i, val) (val >> (7 * i)) & ((1U << 7) - 1) +#define extract7bitsmaskless(i, val) (val >> (7 * i)) + +unsigned char *vbyteenc(unsigned *in, const size_t length, unsigned *out/*, + size_t &nvalue*/) { + unsigned char *bout = (unsigned char *)(out); + //const unsigned char *const initbout = (unsigned char *)(out); + //unsigned prev = 0; + size_t k; + for (k = 0; k < length; ++k) { + const unsigned val = /*delta ? in[k] - prev :*/ in[k]; + //if (delta) prev = in[k]; + /** + * Code below could be shorter. Whether it could be faster + * depends on your compiler and machine. + */ + if (val < (1U << 7)) { + *bout = (unsigned char)(val | (1U << 7)); + ++bout; + } else if (val < (1U << 14)) { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bitsmaskless(1,val) | (1U << 7); + ++bout; + } else if (val < (1U << 21)) { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bits(1,val); + ++bout; + *bout = extract7bitsmaskless(2,val) | (1U << 7); + ++bout; + } else if (val < (1U << 28)) { + *bout = extract7bits(0, val); + ++bout; + *bout = extract7bits(1, val); + ++bout; + *bout = extract7bits(2, val); + ++bout; + *bout = extract7bitsmaskless(3, val) | (1U << 7); + ++bout; + } else { + *bout = extract7bits(0,val); + ++bout; + *bout = extract7bits(1,val); + ++bout; + *bout = extract7bits(2,val); + ++bout; + *bout = extract7bits(3,val); + ++bout; + *bout = extract7bitsmaskless(4,val) | (1U << 7); + ++bout; + } + } + /*while (needPaddingTo32Bits(bout)) { + *bout++ = 0; + } + const size_t storageinbytes = bout - initbout; + assert((storageinbytes % 4) == 0); + nvalue = storageinbytes / 4;*/ + return bout; +} + + +unsigned char *vbytedec(const unsigned *in, const size_t length, + unsigned *out/*, size_t &nvalue*/) { + unsigned prev = 0; + if (length == 0) { + //nvalue = 0; + return in;//abort + } + const unsigned char *inbyte = (const unsigned char *)(in); + const unsigned char *const endbyte = (const unsigned char *)(out + + length); + //const unsigned *const initout(out); + + while (endbyte > out) { + unsigned int shift = 0; unsigned v; + for (v = 0; endbyte > out; shift += 7) { + unsigned char c = *inbyte++; + v += ((c & 127) << shift); + if ((c & 128)) { + *out++ = /*delta ? (prev = v + prev) :*/ v; + break; + } + } + } + //nvalue = out - initout; + //inbyte = padTo32bits(inbyte); + return (const unsigned *)(inbyte); + } + diff --git a/aux/varintg8iu.c b/aux/varintg8iu.c new file mode 100644 index 0000000..29ebfa4 --- /dev/null +++ b/aux/varintg8iu.c @@ -0,0 +1,181 @@ +// C Version of "VarIntG8IU.h" from https://github.com/lemire/FastPFor +/** + * This code is released under the + * Apache License Version 2.0 http://www.apache.org/licenses/. + */ +/** + * + * Implementation of varint-G8IU taken from + * Stepanov et al., SIMD-Based Decoding of Posting Lists, CIKM 2011 + * + * Update: D. 
Lemire believes that this scheme was patented by Rose, Stepanov et al. (patent 20120221539). + * We wrote this code before the patent was published (August 2012). + * + * By Maxime Caron + * From + * https://github.com/maximecaron/SIMD-Based-Posting-lists + * with minor modifications by D. Lemire. + */ +#ifndef __SSSE3__ +#pragma message "Disabling varintg8iu due to lack of SSSE3 support, try adding -mssse3" +#else +#ifndef VARINTG8IU_H__ +#define VARINTG8IU_H__ +#include +//#include "codecs.h" +#ifdef __GNUC__ +#define PREDICT_FALSE(x) (__builtin_expect(x, 0)) +#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) +#else +#define PREDICT_FALSE(x) x +#define PREDICT_TRUE(x) x +#endif +#include "varintg8iu.h" + +typedef char v16qi __attribute__ ((vector_size (16))); + +static int maskOutputSize[256]; +static char mask[256][32]; + + int getNumByteNeeded(const uint32_t value) { + if (value > 0x000000FF) { + if (value > 0x0000FFFF) { + if (value > 0x00FFFFFF) { + return 4; + } else { + return 3; + } + } else { + return 2; + } + } else { + return 1; + } + } + + + // For all possible values of the + // descriptor we build a table of any shuffle sequence + // that might be needed at decode time. +void VarIntG8IU() { + for (int desc = 0; desc <= 255; desc++) { + int bitmask = 0x00000001; + int bitindex = 0; + // count number of 0 in the char + int complete = 0; + int ithSize[8]; + int lastpos = -1; + while (bitindex < 8) { + if ((desc & bitmask) == 0) { + ithSize[complete] = bitindex - lastpos; + lastpos = bitindex; + complete++; + } + bitindex++; + bitmask = bitmask << 1; + } + maskOutputSize[desc] = complete; + + int j = 0; + int k = 0; + for (int i = 0; i < complete; i++) { + for (int n = 0; n < 4; n++) { + if (n < ithSize[i]) { + mask[desc][k] = j; + j = j + 1; + } else { + mask[desc][k] = -1; + } + k = k + 1; + } + } + + } + + } + +unsigned char *vintg8enc(const uint32_t *__restrict__ in, const size_t length, unsigned char *__restrict__ out) { + const uint32_t *in_ = in + length; //size_t srclength = length * 4;unsigned char* dest = (unsigned char*)(out);size_t dstlength = length * 4; + //size_t compressed_size = 0; + while(in < in_ /*srclength > 0 && dstlength >= 9*/) { //compressed_size += encodeBlock(in, srclength, dst, nvalue); + unsigned char desc = 0xFF; + unsigned char bitmask = 0x01; + uint32_t buffer[8]; + int ithSize[8]; + int length = 0; + int numInt = 0; + + while (in < in_ /*srclength > 0*/) { + const uint32_t* temp = in; + int byteNeeded = getNumByteNeeded(*temp); + + if (PREDICT_FALSE(length + byteNeeded > 8)) { + break; + } + + //flip the correct bit in desc + bitmask = bitmask << (byteNeeded - 1); + desc = desc ^ bitmask; + bitmask = bitmask << 1; + + ithSize[numInt] = byteNeeded; + length += byteNeeded; + buffer[numInt] = *temp; + ++in;// = in + 1; + //srclength -= 4; + numInt++; + } + out[0] = desc; + int written = 1; + for(int i = 0; i < numInt; i++) { + int size = ithSize[i]; + uint32_t value = buffer[i]; + for (int j = 0; j < size; j++) { + out[written++] = value >> (j * 8); + } + } + out += 9; //dstlength -= 9; //compressed_size += 9; + } + //Ouput might not be a multiple of 4 so we make it so + return out; //out + ((compressed_size + 3)/ 4); + } + +unsigned char *vintg8dec(const unsigned char *__restrict__ in, const size_t length, uint32_t *__restrict__ out) { + size_t srclength = length * 4; + const unsigned *out_ = out + length; //uint32_t * dest = out;size_t nvalue = length * 4; //uint32_t uncompressSize = 0; + while (out < out_ /*srclength >= 9*/) { //uncompressSize 
+= decodeBlock(in, srclength, dst/*, nvalue*/); + const unsigned char* pdesc = in++; + unsigned char desc = *pdesc; + srclength -= 1; + + const unsigned char* peek = in; + v16qi data; + if (PREDICT_TRUE(srclength >= 16)) { + // read 16 byte of data only if we need to avoid cache miss + data = __builtin_ia32_lddqu((const char*) (peek)); + } else { + static char buff[16]; + memcpy(buff, peek, 8); + data = __builtin_ia32_lddqu(buff); + } + // load de required mask + v16qi shf = __builtin_ia32_lddqu(mask[desc]); + v16qi result = __builtin_ia32_pshufb128(data, shf); + char* dst = (char*) (out); + __builtin_ia32_storedqu(dst, result); + int readSize = maskOutputSize[desc]; + + if (PREDICT_TRUE( readSize >= 4)) { + v16qi shf2 = __builtin_ia32_lddqu(mask[desc] + 16); + v16qi result2 = __builtin_ia32_pshufb128(data, shf2); + __builtin_ia32_storedqu(dst + (16), result2); + } + // pop 8 input char + in += 8; srclength -= 8; out += readSize; //dstlength -= readSize * 4;// uncompressSize += readSize; + } + return in; //(uint32_t *) (((uintptr_t) (src) + 3) & ~3); + +} + +#endif //__SSE3__ +#endif diff --git a/aux/varintg8iu.h b/aux/varintg8iu.h new file mode 100644 index 0000000..a2659d4 --- /dev/null +++ b/aux/varintg8iu.h @@ -0,0 +1,5 @@ +#include +void VarIntG8IU(); +unsigned char *vintg8enc(const uint32_t *__restrict__ in, const size_t length, unsigned char *__restrict__ out); +unsigned char *vintg8dec(const unsigned char *__restrict__ in, const size_t length, uint32_t *__restrict__ out); + diff --git a/aux/vas16c.h b/aux/vas16c.h new file mode 100644 index 0000000..15e671d --- /dev/null +++ b/aux/vas16c.h @@ -0,0 +1,35 @@ +// optimized version from: http://jinruhe.com/ +static int s16_cnum[16] = {28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1}; +static int s16_cbits[16][28] = { + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + {2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,2,2,2,2,2,2,2,1,1,1,1,1,1,1,0,0,0,0,0,0,0}, + {1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,0,0,0,0,0,0,0}, + {2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {3,4,4,4,4,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,5,5,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {4,4,5,5,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {6,6,6,5,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {5,5,6,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {7,7,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {10,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {14,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, + {28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} }; + +#define S16ENC(__w, __p, m) { unsigned *_p = __p, *_w = __w; \ + unsigned int _k, _j, _m, _o; \ + for (_k = 0; _k < 16; _k++) { \ + *_w = _k<<28; \ + _m = (s16_cnum[_k] < m)? 
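/* cap the element count at what selector _k can hold */ \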
s16_cnum[_k]:m; \ + for (_j = 0, _o = 0; (_j < _m) && (*(_p+_j) < (1<>28) {\ + case 0:\ + _p[ 0] = (_rw ) & 1;\ + _p[ 1] = (_rw>> 1) & 1;\ + _p[ 2] = (_rw>> 2) & 1;\ + _p[ 3] = (_rw>> 3) & 1;\ + _p[ 4] = (_rw>> 4) & 1;\ + _p[ 5] = (_rw>> 5) & 1;\ + _p[ 6] = (_rw>> 6) & 1;\ + _p[ 7] = (_rw>> 7) & 1;\ + _p[ 8] = (_rw>> 8) & 1;\ + _p[ 9] = (_rw>> 9) & 1;\ + _p[10] = (_rw>>10) & 1;\ + _p[11] = (_rw>>11) & 1;\ + _p[12] = (_rw>>12) & 1;\ + _p[13] = (_rw>>13) & 1;\ + _p[14] = (_rw>>14) & 1;\ + _p[15] = (_rw>>15) & 1;\ + _p[16] = (_rw>>16) & 1;\ + _p[17] = (_rw>>17) & 1;\ + _p[18] = (_rw>>18) & 1;\ + _p[19] = (_rw>>19) & 1;\ + _p[20] = (_rw>>20) & 1;\ + _p[21] = (_rw>>21) & 1;\ + _p[22] = (_rw>>22) & 1;\ + _p[23] = (_rw>>23) & 1;\ + _p[24] = (_rw>>24) & 1;\ + _p[25] = (_rw>>25) & 1;\ + _p[26] = (_rw>>26) & 1;\ + _p[27] = (_rw>>27) & 1; _p += 28;\ + break;\ + case 1: \ + _p[ 0] = (_rw ) & 3;\ + _p[ 1] = (_rw>> 2) & 3;\ + _p[ 2] = (_rw>> 4) & 3;\ + _p[ 3] = (_rw>> 6) & 3;\ + _p[ 4] = (_rw>> 8) & 3;\ + _p[ 5] = (_rw>>10) & 3;\ + _p[ 6] = (_rw>>12) & 3;\ + _p[ 7] = (_rw>>14) & 1;\ + _p[ 8] = (_rw>>15) & 1;\ + _p[ 9] = (_rw>>16) & 1;\ + _p[10] = (_rw>>17) & 1;\ + _p[11] = (_rw>>18) & 1;\ + _p[12] = (_rw>>19) & 1;\ + _p[13] = (_rw>>20) & 1;\ + _p[14] = (_rw>>21) & 1;\ + _p[15] = (_rw>>22) & 1;\ + _p[16] = (_rw>>23) & 1;\ + _p[17] = (_rw>>24) & 1;\ + _p[18] = (_rw>>25) & 1;\ + _p[19] = (_rw>>26) & 1;\ + _p[20] = (_rw>>27) & 1; _p += 21; \ + break; \ + case 2: \ + _p[0] = (_rw) & 1; \ + _p[1] = (_rw>>1) & 1;\ + _p[2] = (_rw>>2) & 1;\ + _p[3] = (_rw>>3) & 1;\ + _p[4] = (_rw>>4) & 1;\ + _p[5] = (_rw>>5) & 1;\ + _p[6] = (_rw>>6) & 1;\ + _p[7] = (_rw>>7) & 3;\ + _p[8] = (_rw>>9) & 3;\ + _p[9] = (_rw>>11) & 3;\ + _p[10] = (_rw>>13) & 3;\ + _p[11] = (_rw>>15) & 3;\ + _p[12] = (_rw>>17) & 3;\ + _p[13] = (_rw>>19) & 3;\ + _p[14] = (_rw>>21) & 1;\ + _p[15] = (_rw>>22) & 1;\ + _p[16] = (_rw>>23) & 1;\ + _p[17] = (_rw>>24) & 1;\ + _p[18] = (_rw>>25) & 1;\ + _p[19] = (_rw>>26) & 1;\ + _p[20] = (_rw>>27) & 1; _p += 21;\ + break; \ + case 3: \ + _p[0] = (_rw) & 1; \ + _p[1] = (_rw>>1) & 1;\ + _p[2] = (_rw>>2) & 1;\ + _p[3] = (_rw>>3) & 1;\ + _p[4] = (_rw>>4) & 1;\ + _p[5] = (_rw>>5) & 1;\ + _p[6] = (_rw>>6) & 1;\ + _p[7] = (_rw>>7) & 1;\ + _p[8] = (_rw>>8) & 1;\ + _p[9] = (_rw>>9) & 1;\ + _p[10] = (_rw>>10) & 1;\ + _p[11] = (_rw>>11) & 1;\ + _p[12] = (_rw>>12) & 1;\ + _p[13] = (_rw>>13) & 1;\ + _p[14] = (_rw>>14) & 3;\ + _p[15] = (_rw>>16) & 3;\ + _p[16] = (_rw>>18) & 3;\ + _p[17] = (_rw>>20) & 3;\ + _p[18] = (_rw>>22) & 3;\ + _p[19] = (_rw>>24) & 3;\ + _p[20] = (_rw>>26) & 3; _p += 21;\ + break; \ + case 4: \ + _p[ 0] = (_rw ) & 3;\ + _p[ 1] = (_rw>> 2) & 3;\ + _p[ 2] = (_rw>> 4) & 3;\ + _p[ 3] = (_rw>> 6) & 3;\ + _p[ 4] = (_rw>> 8) & 3;\ + _p[ 5] = (_rw>>10) & 3;\ + _p[ 6] = (_rw>>12) & 3;\ + _p[ 7] = (_rw>>14) & 3;\ + _p[ 8] = (_rw>>16) & 3;\ + _p[ 9] = (_rw>>18) & 3;\ + _p[10] = (_rw>>20) & 3;\ + _p[11] = (_rw>>22) & 3;\ + _p[12] = (_rw>>24) & 3;\ + _p[13] = (_rw>>26) & 3; _p += 14;\ + break; \ + case 5: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 7;\ + _p[2] = (_rw>>7) & 7;\ + _p[3] = (_rw>>10) & 7;\ + _p[4] = (_rw>>13) & 7;\ + _p[5] = (_rw>>16) & 7;\ + _p[6] = (_rw>>19) & 7;\ + _p[7] = (_rw>>22) & 7;\ + _p[8] = (_rw>>25) & 7; _p += 9;\ + break; \ + case 6: \ + _p[0] = (_rw) & 7; \ + _p[1] = (_rw>>3) & 15;\ + _p[2] = (_rw>>7) & 15;\ + _p[3] = (_rw>>11) & 15;\ + _p[4] = (_rw>>15) & 15;\ + _p[5] = (_rw>>19) & 7;\ + _p[6] = (_rw>>22) & 7;\ + _p[7] = (_rw>>25) & 7; _p += 8;\ + break; \ + case 7: \ + _p[0] = (_rw) & 
15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 15;\ + _p[3] = (_rw>>12) & 15;\ + _p[4] = (_rw>>16) & 15;\ + _p[5] = (_rw>>20) & 15;\ + _p[6] = (_rw>>24) & 15; _p += 7;\ + break; \ + case 8: \ + _p[0] = (_rw ) & 31;\ + _p[1] = (_rw>> 5) & 31;\ + _p[2] = (_rw>>10) & 31;\ + _p[3] = (_rw>>15) & 31;\ + _p[4] = (_rw>>20) & 15;\ + _p[5] = (_rw>>24) & 15; _p += 6;\ + break; \ + case 9: \ + _p[0] = (_rw) & 15; \ + _p[1] = (_rw>>4) & 15;\ + _p[2] = (_rw>>8) & 31;\ + _p[3] = (_rw>>13) & 31;\ + _p[4] = (_rw>>18) & 31;\ + _p[5] = (_rw>>23) & 31; _p += 6;\ + break; \ + case 10: \ + _p[0] = (_rw) & 63; \ + _p[1] = (_rw>>6) & 63;\ + _p[2] = (_rw>>12) & 63;\ + _p[3] = (_rw>>18) & 31;\ + _p[4] = (_rw>>23) & 31; _p += 5;\ + break; \ + case 11: \ + _p[0] = (_rw) & 31; \ + _p[1] = (_rw>>5) & 31;\ + _p[2] = (_rw>>10) & 63;\ + _p[3] = (_rw>>16) & 63;\ + _p[4] = (_rw>>22) & 63; _p += 5;\ + break; \ + case 12: \ + _p[0] = (_rw) & 127; \ + _p[1] = (_rw>>7) & 127;\ + _p[2] = (_rw>>14) & 127;\ + _p[3] = (_rw>>21) & 127; _p += 4;\ + break; \ + case 13: \ + _p[0] = (_rw) & 1023; \ + _p[1] = (_rw>>10) & 511;\ + _p[2] = (_rw>>19) & 511; _p += 3;\ + break; \ + case 14: \ + _p[0] = (_rw) & 16383; \ + _p[1] = (_rw>>14) & 16383; _p += 2;\ + break; \ + case 15: \ + _p[0] = (_rw) & ((1<<28)-1); _p++; \ + break; \ + } \ +} + +#if 0 +#define BREAK _rw = *_in++; goto *_lab[__out<_oute?((_rw)>>28):16] + +#define s16dec(__in, __n, __pout) ({\ + __label__ _lab0,_lab1,_lab2,_lab3,_lab4,_lab5,_lab6,_lab7,_lab8,_lab9,_lab10,_lab11,_lab12,_lab13,_lab14,_lab15,_labend;\ + static void *_lab[] = { &&_lab0, &&_lab1, &&_lab2, &&_lab3, &&_lab4, &&_lab5, &&_lab6, &&_lab7, &&_lab8, &&_lab9, &&_lab10, &&_lab11, &&_lab12, &&_lab13, &&_lab14, &&_lab15, &&_labend };\ + unsigned *_in = __in; typeof(__pout[0]) *__out = __pout, *_oute = __out+(__n); register unsigned _rw = *_in++; goto *_lab[(_rw)>>28];\ + _lab0:\ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 1; \ + __out[15] = (_rw>>15) & 1; \ + __out[16] = (_rw>>16) & 1; \ + __out[17] = (_rw>>17) & 1; \ + __out[18] = (_rw>>18) & 1; \ + __out[19] = (_rw>>19) & 1; \ + __out[20] = (_rw>>20) & 1; \ + __out[21] = (_rw>>21) & 1; \ + __out[22] = (_rw>>22) & 1; \ + __out[23] = (_rw>>23) & 1; \ + __out[24] = (_rw>>24) & 1; \ + __out[25] = (_rw>>25) & 1; \ + __out[26] = (_rw>>26) & 1; \ + __out[27] = (_rw>>27) & 1; __out += 28;\ + BREAK; \ + _lab1: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 1; \ + __out[8] = (_rw>>15) & 1; \ + __out[9] = (_rw>>16) & 1; \ + __out[10] = (_rw>>17) & 1; \ + __out[11] = (_rw>>18) & 1; \ + __out[12] = (_rw>>19) & 1; \ + __out[13] = (_rw>>20) & 1; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ + __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21; \ + BREAK; \ + _lab2: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + 
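/* selector 2 decodes 7x1-bit, 7x2-bit, then 7x1-bit values: 21 integers from the 28-bit payload (7+14+7 = 28 bits) */ \ + 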
__out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 3; \ + __out[8] = (_rw>>9) & 3; \ + __out[9] = (_rw>>11) & 3; \ + __out[10] = (_rw>>13) & 3; \ + __out[11] = (_rw>>15) & 3; \ + __out[12] = (_rw>>17) & 3; \ + __out[13] = (_rw>>19) & 3; \ + __out[14] = (_rw>>21) & 1; \ + __out[15] = (_rw>>22) & 1; \ + __out[16] = (_rw>>23) & 1; \ + __out[17] = (_rw>>24) & 1; \ + __out[18] = (_rw>>25) & 1; \ + __out[19] = (_rw>>26) & 1; \ + __out[20] = (_rw>>27) & 1; __out += 21;\ + BREAK; \ + _lab3: \ + __out[0] = (_rw) & 1; \ + __out[1] = (_rw>>1) & 1; \ + __out[2] = (_rw>>2) & 1; \ + __out[3] = (_rw>>3) & 1; \ + __out[4] = (_rw>>4) & 1; \ + __out[5] = (_rw>>5) & 1; \ + __out[6] = (_rw>>6) & 1; \ + __out[7] = (_rw>>7) & 1; \ + __out[8] = (_rw>>8) & 1; \ + __out[9] = (_rw>>9) & 1; \ + __out[10] = (_rw>>10) & 1; \ + __out[11] = (_rw>>11) & 1; \ + __out[12] = (_rw>>12) & 1; \ + __out[13] = (_rw>>13) & 1; \ + __out[14] = (_rw>>14) & 3; \ + __out[15] = (_rw>>16) & 3; \ + __out[16] = (_rw>>18) & 3; \ + __out[17] = (_rw>>20) & 3; \ + __out[18] = (_rw>>22) & 3; \ + __out[19] = (_rw>>24) & 3; \ + __out[20] = (_rw>>26) & 3; __out += 21;\ + BREAK; \ + _lab4: \ + __out[0] = (_rw) & 3; \ + __out[1] = (_rw>>2) & 3; \ + __out[2] = (_rw>>4) & 3; \ + __out[3] = (_rw>>6) & 3; \ + __out[4] = (_rw>>8) & 3; \ + __out[5] = (_rw>>10) & 3; \ + __out[6] = (_rw>>12) & 3; \ + __out[7] = (_rw>>14) & 3; \ + __out[8] = (_rw>>16) & 3; \ + __out[9] = (_rw>>18) & 3; \ + __out[10] = (_rw>>20) & 3; \ + __out[11] = (_rw>>22) & 3; \ + __out[12] = (_rw>>24) & 3; \ + __out[13] = (_rw>>26) & 3; __out += 14;\ + BREAK; \ + _lab5: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 7; \ + __out[2] = (_rw>>7) & 7; \ + __out[3] = (_rw>>10) & 7; \ + __out[4] = (_rw>>13) & 7; \ + __out[5] = (_rw>>16) & 7; \ + __out[6] = (_rw>>19) & 7; \ + __out[7] = (_rw>>22) & 7; \ + __out[8] = (_rw>>25) & 7; __out += 9;\ + BREAK; \ + _lab6: \ + __out[0] = (_rw) & 7; \ + __out[1] = (_rw>>3) & 15; \ + __out[2] = (_rw>>7) & 15; \ + __out[3] = (_rw>>11) & 15; \ + __out[4] = (_rw>>15) & 15; \ + __out[5] = (_rw>>19) & 7; \ + __out[6] = (_rw>>22) & 7; \ + __out[7] = (_rw>>25) & 7; __out += 8;\ + BREAK; \ + _lab7: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 15; \ + __out[3] = (_rw>>12) & 15; \ + __out[4] = (_rw>>16) & 15; \ + __out[5] = (_rw>>20) & 15; \ + __out[6] = (_rw>>24) & 15; __out += 7;\ + BREAK; \ + _lab8: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 31; \ + __out[3] = (_rw>>15) & 31; \ + __out[4] = (_rw>>20) & 15; \ + __out[5] = (_rw>>24) & 15; __out += 6;\ + BREAK; \ + _lab9: \ + __out[0] = (_rw) & 15; \ + __out[1] = (_rw>>4) & 15; \ + __out[2] = (_rw>>8) & 31; \ + __out[3] = (_rw>>13) & 31; \ + __out[4] = (_rw>>18) & 31; \ + __out[5] = (_rw>>23) & 31; __out += 6;\ + BREAK; \ + _lab10: \ + __out[0] = (_rw) & 63; \ + __out[1] = (_rw>>6) & 63; \ + __out[2] = (_rw>>12) & 63; \ + __out[3] = (_rw>>18) & 31; \ + __out[4] = (_rw>>23) & 31; __out += 5;\ + BREAK; \ + _lab11: \ + __out[0] = (_rw) & 31; \ + __out[1] = (_rw>>5) & 31; \ + __out[2] = (_rw>>10) & 63; \ + __out[3] = (_rw>>16) & 63; \ + __out[4] = (_rw>>22) & 63; __out += 5;\ + BREAK; \ + _lab12: \ + __out[0] = (_rw) & 127; \ + __out[1] = (_rw>>7) & 127; \ + __out[2] = (_rw>>14) & 127; \ + __out[3] = (_rw>>21) & 127; __out += 4;\ + BREAK; \ + _lab13: \ + __out[0] = (_rw) & 1023; \ + __out[1] = (_rw>>10) & 511; \ + __out[2] = (_rw>>19) & 511; __out += 3;\ + BREAK; \ + _lab14:\ + 
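/* selector 14 (header nibble 0xE): two 14-bit values, e.g. _rw = 0xE0000000|(v1<<14)|v0 decodes as v0 = _rw & 16383, v1 = (_rw>>14) & 16383 */ \ + 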
__out[0] = (_rw) & 16383; \ + __out[1] = (_rw>>14) & 16383; __out += 2;\ + BREAK; \ + _lab15:\ + __out[0] = (_rw) & ((1<<28)-1); __out++; \ + BREAK;\ + _labend:;(_in-1);\ +}) +#endif diff --git a/aux/vbyte_poly.h b/aux/vbyte_poly.h new file mode 100644 index 0000000..3c2668d --- /dev/null +++ b/aux/vbyte_poly.h @@ -0,0 +1,46 @@ +// +#define VBYTE_ENC(_v, _n) \ +{\ + unsigned _num; \ + unsigned char _barray[5]; \ + unsigned _i, _started = 0; \ + _num = _n; \ + for (_i = 0; _i < 5; _i++) \ + { \ + _barray[_i] = ((_num%128)<<1); \ + _num = _num/128; \ + } \ + for (_i = 4; _i > 0; _i--) \ + { \ + if ((_barray[_i] != 0) || (_started == 1)) \ + { \ + _started = 1; \ + *_v = _barray[_i]|0x1; \ + _v++; \ + } \ + } \ + *_v = _barray[0]|0x0; \ + _v++; \ +} + +#define VBYTE_DEC(_v, _n) \ +{\ + _n = ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if ((*_v&0x1)!= 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + if ((*_v&0x1) != 0) \ + { \ + _v++; \ + _n = (_n<<7) + ((*_v>>1)); \ + }\ + }\ + }\ + _v++; \ +} + diff --git a/bitpack.c b/bitpack.c new file mode 100644 index 0000000..e364984 --- /dev/null +++ b/bitpack.c @@ -0,0 +1,34 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + bitpack.c - "Integer Compression" binary packing +**/ +#include "bitpack_.h" +#include "bitpack.h" +#define IPPB( __ip,__x, __parm) + +#define PAD8(__x) ( (((__x)+8-1)/8) ) + +unsigned char *bitpack32(unsigned *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } +unsigned char *bitpack16(unsigned short *__restrict__ in, int n, int nb, unsigned char *__restrict__ out) { unsigned char *pout = out+PAD8(n*nb); BITPACK32(in, n, nb, out, 0); return pout; } + diff --git a/bitpack.h b/bitpack.h new file mode 100644 index 0000000..77dee67 --- /dev/null +++ b/bitpack.h @@ -0,0 +1,30 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + bitpack.c - "Integer Compression" binary packing +**/ + +unsigned char *bitpack16( unsigned short *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out); +unsigned char *bitpack32( unsigned *__restrict__ in, int n, int nbits, unsigned char *__restrict__ out); + + diff --git a/bitpack64_.h b/bitpack64_.h new file mode 100644 index 0000000..d74b27c --- /dev/null +++ b/bitpack64_.h @@ -0,0 +1,1136 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + bitpack64_.h - "Integer Compression" binary packing +**/ + +#define BITBLK32_1(ip, i, op, parm) { ; register uint32_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 1;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 2;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 3;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 4;\ + IPPB(ip, i*32+ 5, parm); w |= (uint32_t)SRC(ip, i*32+ 5) << 5;\ + IPPB(ip, i*32+ 6, parm); w |= (uint32_t)SRC(ip, i*32+ 6) << 6;\ + IPPB(ip, i*32+ 7, parm); w |= (uint32_t)SRC(ip, i*32+ 7) << 7;\ + IPPB(ip, i*32+ 8, parm); w |= (uint32_t)SRC(ip, i*32+ 8) << 8;\ + IPPB(ip, i*32+ 9, parm); w |= (uint32_t)SRC(ip, i*32+ 9) << 9;\ + IPPB(ip, i*32+10, parm); w |= (uint32_t)SRC(ip, i*32+10) << 10;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 11;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 12;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 13;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 14;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 15;\ + IPPB(ip, i*32+16, parm); w |= (uint32_t)SRC(ip, i*32+16) << 16;\ + IPPB(ip, i*32+17, parm); w |= (uint32_t)SRC(ip, i*32+17) << 17;\ + IPPB(ip, i*32+18, parm); w |= (uint32_t)SRC(ip, i*32+18) << 18;\ + IPPB(ip, i*32+19, parm); w |= (uint32_t)SRC(ip, i*32+19) << 19;\ + IPPB(ip, i*32+20, parm); w |= (uint32_t)SRC(ip, i*32+20) << 20;\ + IPPB(ip, i*32+21, parm); w |= (uint32_t)SRC(ip, i*32+21) << 21;\ + IPPB(ip, i*32+22, parm); w |= (uint32_t)SRC(ip, i*32+22) << 22;\ + IPPB(ip, i*32+23, parm); w |= (uint32_t)SRC(ip, i*32+23) << 23;\ + IPPB(ip, i*32+24, parm); w |= (uint32_t)SRC(ip, i*32+24) << 24;\ + IPPB(ip, i*32+25, parm); w 
|= (uint32_t)SRC(ip, i*32+25) << 25;\ + IPPB(ip, i*32+26, parm); w |= (uint32_t)SRC(ip, i*32+26) << 26;\ + IPPB(ip, i*32+27, parm); w |= (uint32_t)SRC(ip, i*32+27) << 27;\ + IPPB(ip, i*32+28, parm); w |= (uint32_t)SRC(ip, i*32+28) << 28;\ + IPPB(ip, i*32+29, parm); w |= (uint32_t)SRC(ip, i*32+29) << 29;\ + IPPB(ip, i*32+30, parm); w |= (uint32_t)SRC(ip, i*32+30) << 30;\ + IPPB(ip, i*32+31, parm); w |= (uint32_t)SRC(ip, i*32+31) << 31;*((uint32_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_1(ip, op, parm) { \ + BITBLK32_1(ip, 0, op, parm); SRCI(ip); op += 1*4/sizeof(op[0]);\ +} + +#define BITBLK64_2(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 2;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 4;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 6;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 8;\ + IPPB(ip, i*32+ 5, parm); w |= (uint32_t)SRC(ip, i*32+ 5) << 10;\ + IPPB(ip, i*32+ 6, parm); w |= (uint32_t)SRC(ip, i*32+ 6) << 12;\ + IPPB(ip, i*32+ 7, parm); w |= (uint32_t)SRC(ip, i*32+ 7) << 14;\ + IPPB(ip, i*32+ 8, parm); w |= (uint32_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); w |= (uint32_t)SRC(ip, i*32+ 9) << 18;\ + IPPB(ip, i*32+10, parm); w |= (uint32_t)SRC(ip, i*32+10) << 20;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 22;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 24;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 26;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 28;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 30;\ + IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 34;\ + IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 36;\ + IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 38;\ + IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, i*32+20) << 40;\ + IPPB(ip, i*32+21, parm); w |= (uint64_t)SRC(ip, i*32+21) << 42;\ + IPPB(ip, i*32+22, parm); w |= (uint64_t)SRC(ip, i*32+22) << 44;\ + IPPB(ip, i*32+23, parm); w |= (uint64_t)SRC(ip, i*32+23) << 46;\ + IPPB(ip, i*32+24, parm); w |= (uint64_t)SRC(ip, i*32+24) << 48;\ + IPPB(ip, i*32+25, parm); w |= (uint64_t)SRC(ip, i*32+25) << 50;\ + IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 52;\ + IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 54;\ + IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 56;\ + IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 58;\ + IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 60;\ + IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 62;*((uint64_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_2(ip, op, parm) { \ + BITBLK64_2(ip, 0, op, parm); SRCI(ip); op += 2*4/sizeof(op[0]);\ +} + +#define BITBLK64_3(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 3;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 6;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 9;\ + IPPB(ip, i*64+ 4, parm); w |= (uint32_t)SRC(ip, i*64+ 4) << 12;\ + IPPB(ip, i*64+ 5, parm); w |= (uint32_t)SRC(ip, i*64+ 5) << 15;\ + IPPB(ip, i*64+ 6, parm); w |= (uint32_t)SRC(ip, i*64+ 6) << 18;\ + IPPB(ip, i*64+ 7, parm); w |= (uint32_t)SRC(ip, i*64+ 7) << 21;\ + IPPB(ip, i*64+ 8, parm); w |= 
(uint32_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); w |= (uint32_t)SRC(ip, i*64+ 9) << 27;\ + IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 30;\ + IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 33;\ + IPPB(ip, i*64+12, parm); w |= (uint64_t)SRC(ip, i*64+12) << 36;\ + IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 39;\ + IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 42;\ + IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 45;\ + IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ + IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 51;\ + IPPB(ip, i*64+18, parm); w |= (uint64_t)SRC(ip, i*64+18) << 54;\ + IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 57;\ + IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 60 | (uint64_t)SRC1(ip, i*64+21) << 63;*((uint64_t *)op+i*3+ 0) = w;\ + IPPB(ip, i*64+21, parm); w = (uint32_t)SRC(ip, i*64+21) >> 1;\ + IPPB(ip, i*64+22, parm); w |= (uint32_t)SRC(ip, i*64+22) << 2;\ + IPPB(ip, i*64+23, parm); w |= (uint32_t)SRC(ip, i*64+23) << 5;\ + IPPB(ip, i*64+24, parm); w |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); w |= (uint32_t)SRC(ip, i*64+25) << 11;\ + IPPB(ip, i*64+26, parm); w |= (uint32_t)SRC(ip, i*64+26) << 14;\ + IPPB(ip, i*64+27, parm); w |= (uint32_t)SRC(ip, i*64+27) << 17;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 20;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 23;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) << 26;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 29;*((uint64_t *)op+i*3+ 1) = w;;\ +} + +#define BITPACK64_3(ip, op, parm) { \ + BITBLK64_3(ip, 0, op, parm); SRCI(ip); op += 3*4/sizeof(op[0]);\ +} + +#define BITBLK64_4(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*16+ 0, parm); w = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); w |= (uint32_t)SRC(ip, i*16+ 1) << 4;\ + IPPB(ip, i*16+ 2, parm); w |= (uint32_t)SRC(ip, i*16+ 2) << 8;\ + IPPB(ip, i*16+ 3, parm); w |= (uint32_t)SRC(ip, i*16+ 3) << 12;\ + IPPB(ip, i*16+ 4, parm); w |= (uint32_t)SRC(ip, i*16+ 4) << 16;\ + IPPB(ip, i*16+ 5, parm); w |= (uint32_t)SRC(ip, i*16+ 5) << 20;\ + IPPB(ip, i*16+ 6, parm); w |= (uint32_t)SRC(ip, i*16+ 6) << 24;\ + IPPB(ip, i*16+ 7, parm); w |= (uint32_t)SRC(ip, i*16+ 7) << 28;\ + IPPB(ip, i*16+ 8, parm); w |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ + IPPB(ip, i*16+ 9, parm); w |= (uint64_t)SRC(ip, i*16+ 9) << 36;\ + IPPB(ip, i*16+10, parm); w |= (uint64_t)SRC(ip, i*16+10) << 40;\ + IPPB(ip, i*16+11, parm); w |= (uint64_t)SRC(ip, i*16+11) << 44;\ + IPPB(ip, i*16+12, parm); w |= (uint64_t)SRC(ip, i*16+12) << 48;\ + IPPB(ip, i*16+13, parm); w |= (uint64_t)SRC(ip, i*16+13) << 52;\ + IPPB(ip, i*16+14, parm); w |= (uint64_t)SRC(ip, i*16+14) << 56;\ + IPPB(ip, i*16+15, parm); w |= (uint64_t)SRC(ip, i*16+15) << 60;*((uint64_t *)op+i*1+ 0) = w;;\ +} + +#define BITPACK64_4(ip, op, parm) { \ + BITBLK64_4(ip, 0, op, parm);\ + BITBLK64_4(ip, 1, op, parm); SRCI(ip); op += 4*4/sizeof(op[0]);\ +} + +#define BITBLK64_5(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 5;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 10;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 15;\ + IPPB(ip, i*64+ 4, parm); w |= (uint32_t)SRC(ip, i*64+ 4) << 20;\ + IPPB(ip, i*64+ 5, parm); w |= (uint32_t)SRC(ip, i*64+ 5) << 25;\ + 
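/* from bit 30 on the (uint64_t) cast keeps the shift from overflowing 32 bits; value 12 straddles the word boundary: SRC1 stores its 4 low bits at bits 60..63, the >> 4 in the next statement carries the top bit into the following word */ \ + 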
IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 30;\ + IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 35;\ + IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 40;\ + IPPB(ip, i*64+ 9, parm); w |= (uint64_t)SRC(ip, i*64+ 9) << 45;\ + IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 50;\ + IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 55 | (uint64_t)SRC1(ip, i*64+12) << 60;*((uint64_t *)op+i*5+ 0) = w;\ + IPPB(ip, i*64+12, parm); w = (uint32_t)SRC(ip, i*64+12) >> 4;\ + IPPB(ip, i*64+13, parm); w |= (uint32_t)SRC(ip, i*64+13) << 1;\ + IPPB(ip, i*64+14, parm); w |= (uint32_t)SRC(ip, i*64+14) << 6;\ + IPPB(ip, i*64+15, parm); w |= (uint32_t)SRC(ip, i*64+15) << 11;\ + IPPB(ip, i*64+16, parm); w |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); w |= (uint32_t)SRC(ip, i*64+17) << 21;\ + IPPB(ip, i*64+18, parm); w |= (uint32_t)SRC(ip, i*64+18) << 26;\ + IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 31;\ + IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 36;\ + IPPB(ip, i*64+21, parm); w |= (uint64_t)SRC(ip, i*64+21) << 41;\ + IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 46;\ + IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 51;\ + IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 56 | (uint64_t)SRC1(ip, i*64+25) << 61;*((uint64_t *)op+i*5+ 1) = w;\ + IPPB(ip, i*64+25, parm); w = (uint32_t)SRC(ip, i*64+25) >> 3;\ + IPPB(ip, i*64+26, parm); w |= (uint32_t)SRC(ip, i*64+26) << 2;\ + IPPB(ip, i*64+27, parm); w |= (uint32_t)SRC(ip, i*64+27) << 7;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 12;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 17;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) << 22;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 27;*((uint64_t *)op+i*5+ 2) = w;;\ +} + +#define BITPACK64_5(ip, op, parm) { \ + BITBLK64_5(ip, 0, op, parm); SRCI(ip); op += 5*4/sizeof(op[0]);\ +} + +#define BITBLK64_6(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 6;\ + IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 12;\ + IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 18;\ + IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 24;\ + IPPB(ip, i*32+ 5, parm); w |= (uint64_t)SRC(ip, i*32+ 5) << 30;\ + IPPB(ip, i*32+ 6, parm); w |= (uint64_t)SRC(ip, i*32+ 6) << 36;\ + IPPB(ip, i*32+ 7, parm); w |= (uint64_t)SRC(ip, i*32+ 7) << 42;\ + IPPB(ip, i*32+ 8, parm); w |= (uint64_t)SRC(ip, i*32+ 8) << 48;\ + IPPB(ip, i*32+ 9, parm); w |= (uint64_t)SRC(ip, i*32+ 9) << 54 | (uint64_t)SRC1(ip, i*32+10) << 60;*((uint64_t *)op+i*3+ 0) = w;\ + IPPB(ip, i*32+10, parm); w = (uint32_t)SRC(ip, i*32+10) >> 4;\ + IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 2;\ + IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 8;\ + IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 14;\ + IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 20;\ + IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 26;\ + IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 38;\ + IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 44;\ + IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 50;\ + IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, i*32+20) << 56 | (uint64_t)SRC1(ip, 
i*32+21) << 62;*((uint64_t *)op+i*3+ 1) = w;\ + IPPB(ip, i*32+21, parm); w = (uint32_t)SRC(ip, i*32+21) >> 2;\ + IPPB(ip, i*32+22, parm); w |= (uint32_t)SRC(ip, i*32+22) << 4;\ + IPPB(ip, i*32+23, parm); w |= (uint32_t)SRC(ip, i*32+23) << 10;\ + IPPB(ip, i*32+24, parm); w |= (uint32_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); w |= (uint32_t)SRC(ip, i*32+25) << 22;\ + IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 28;\ + IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 34;\ + IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 40;\ + IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 46;\ + IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 52;\ + IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 58;*((uint64_t *)op+i*3+ 2) = w;;\ +} + +#define BITPACK64_6(ip, op, parm) { \ + BITBLK64_6(ip, 0, op, parm); SRCI(ip); op += 6*4/sizeof(op[0]);\ +} + +#define BITBLK64_7(ip, i, op, parm) { ; register uint64_t w;;\ + IPPB(ip, i*64+ 0, parm); w = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); w |= (uint32_t)SRC(ip, i*64+ 1) << 7;\ + IPPB(ip, i*64+ 2, parm); w |= (uint32_t)SRC(ip, i*64+ 2) << 14;\ + IPPB(ip, i*64+ 3, parm); w |= (uint32_t)SRC(ip, i*64+ 3) << 21;\ + IPPB(ip, i*64+ 4, parm); w |= (uint64_t)SRC(ip, i*64+ 4) << 28;\ + IPPB(ip, i*64+ 5, parm); w |= (uint64_t)SRC(ip, i*64+ 5) << 35;\ + IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 42;\ + IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 49;\ + IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 56 | (uint64_t)SRC1(ip, i*64+9) << 63;*((uint64_t *)op+i*7+ 0) = w;\ + IPPB(ip, i*64+ 9, parm); w = (uint32_t)SRC(ip, i*64+ 9) >> 1;\ + IPPB(ip, i*64+10, parm); w |= (uint32_t)SRC(ip, i*64+10) << 6;\ + IPPB(ip, i*64+11, parm); w |= (uint32_t)SRC(ip, i*64+11) << 13;\ + IPPB(ip, i*64+12, parm); w |= (uint32_t)SRC(ip, i*64+12) << 20;\ + IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 27;\ + IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 34;\ + IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 41;\ + IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ + IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 55 | (uint64_t)SRC1(ip, i*64+18) << 62;*((uint64_t *)op+i*7+ 1) = w;\ + IPPB(ip, i*64+18, parm); w = (uint32_t)SRC(ip, i*64+18) >> 2;\ + IPPB(ip, i*64+19, parm); w |= (uint32_t)SRC(ip, i*64+19) << 5;\ + IPPB(ip, i*64+20, parm); w |= (uint32_t)SRC(ip, i*64+20) << 12;\ + IPPB(ip, i*64+21, parm); w |= (uint32_t)SRC(ip, i*64+21) << 19;\ + IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 26;\ + IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 33;\ + IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 40;\ + IPPB(ip, i*64+25, parm); w |= (uint64_t)SRC(ip, i*64+25) << 47;\ + IPPB(ip, i*64+26, parm); w |= (uint64_t)SRC(ip, i*64+26) << 54 | (uint64_t)SRC1(ip, i*64+27) << 61;*((uint64_t *)op+i*7+ 2) = w;\ + IPPB(ip, i*64+27, parm); w = (uint32_t)SRC(ip, i*64+27) >> 3;\ + IPPB(ip, i*64+28, parm); w |= (uint32_t)SRC(ip, i*64+28) << 4;\ + IPPB(ip, i*64+29, parm); w |= (uint32_t)SRC(ip, i*64+29) << 11;\ + IPPB(ip, i*64+30, parm); w |= (uint32_t)SRC(ip, i*64+30) << 18;\ + IPPB(ip, i*64+31, parm); w |= (uint32_t)SRC(ip, i*64+31) << 25;*((uint64_t *)op+i*7+ 3) = w;;\ +} + +#define BITPACK64_7(ip, op, parm) { \ + BITBLK64_7(ip, 0, op, parm); SRCI(ip); op += 7*4/sizeof(op[0]);\ +} + +#define BITBLK64_8(ip, i, op, parm) { ;\ + IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*1+ 0) = 
(uint32_t)SRC(ip, i*8+ 0) ;\ + IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 1) << 8;\ + IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 2) << 16;\ + IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*1+ 0) |= (uint32_t)SRC(ip, i*8+ 3) << 24;\ + IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 4) << 32;\ + IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 5) << 40;\ + IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 6) << 48;\ + IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 7) << 56;\ +} + +#define BITPACK64_8(ip, op, parm) { \ + BITBLK64_8(ip, 0, op, parm);\ + BITBLK64_8(ip, 1, op, parm);\ + BITBLK64_8(ip, 2, op, parm);\ + BITBLK64_8(ip, 3, op, parm); SRCI(ip); op += 8*4/sizeof(op[0]);\ +} + +#define BITBLK64_9(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 9;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint32_t)SRC(ip, i*64+ 2) << 18;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 27;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 36;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 5) << 45;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 6) << 54 | (uint64_t)SRC1(ip, i*64+7) << 63;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*9+ 1) = (uint32_t)SRC(ip, i*64+ 7) >> 1;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*9+ 1) |= (uint32_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*9+ 1) |= (uint32_t)SRC(ip, i*64+ 9) << 17;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+10) << 26;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+11) << 35;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+12) << 44;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+13) << 53 | (uint64_t)SRC1(ip, i*64+14) << 62;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*9+ 2) = (uint32_t)SRC(ip, i*64+14) >> 2;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*9+ 2) |= (uint32_t)SRC(ip, i*64+15) << 7;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*9+ 2) |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+17) << 25;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+18) << 34;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+19) << 43;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+20) << 52 | (uint64_t)SRC1(ip, i*64+21) << 61;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*9+ 3) = (uint32_t)SRC(ip, i*64+21) >> 3;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*64+22) << 6;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*64+23) << 15;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+24) << 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+25) << 33;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+26) << 42;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+27) << 51 | (uint64_t)SRC1(ip, i*64+28) << 60;\ + 
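/* value 28 is split across words: its 4 low bits were placed at bits 60..63 of word 3 above, the remaining 5 bits start word 4 via the >> 4 below */ \ + 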
IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*9+ 4) = (uint32_t)SRC(ip, i*64+28) >> 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+29) << 5;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+30) << 14;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*64+31) << 23;\ +} + +#define BITPACK64_9(ip, op, parm) { \ + BITBLK64_9(ip, 0, op, parm); SRCI(ip); op += 9*4/sizeof(op[0]);\ +} + +#define BITBLK64_10(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint32_t)SRC(ip, i*32+ 1) << 10;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint32_t)SRC(ip, i*32+ 2) << 20;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 3) << 30;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 4) << 40;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 5) << 50 | (uint64_t)SRC1(ip, i*32+6) << 60;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*5+ 1) = (uint32_t)SRC(ip, i*32+ 6) >> 4;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*5+ 1) |= (uint32_t)SRC(ip, i*32+ 7) << 6;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*5+ 1) |= (uint32_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+ 9) << 26;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+10) << 36;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+11) << 46 | (uint64_t)SRC1(ip, i*32+12) << 56;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*5+ 2) = (uint32_t)SRC(ip, i*32+12) >> 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+13) << 2;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+14) << 12;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*32+15) << 22;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+17) << 42;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+18) << 52 | (uint64_t)SRC1(ip, i*32+19) << 62;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*5+ 3) = (uint32_t)SRC(ip, i*32+19) >> 2;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*32+20) << 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*32+21) << 18;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+22) << 28;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+23) << 38;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+24) << 48 | (uint64_t)SRC1(ip, i*32+25) << 58;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*5+ 4) = (uint32_t)SRC(ip, i*32+25) >> 6;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*32+26) << 4;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*32+27) << 14;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+28) << 24;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+29) << 34;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+30) << 44;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+31) << 54;\ +} + +#define BITPACK64_10(ip, op, 
parm) { \ + BITBLK64_10(ip, 0, op, parm); SRCI(ip); op += 10*4/sizeof(op[0]);\ +} + +#define BITBLK64_11(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 11;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 22;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 33;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 44 | (uint64_t)SRC1(ip, i*64+5) << 55;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*11+ 1) = (uint32_t)SRC(ip, i*64+ 5) >> 9;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*64+ 6) << 2;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*64+ 7) << 13;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 9) << 35;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+10) << 46 | (uint64_t)SRC1(ip, i*64+11) << 57;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*11+ 2) = (uint32_t)SRC(ip, i*64+11) >> 7;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*64+12) << 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*64+13) << 15;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+14) << 26;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+15) << 37;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 59;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*11+ 3) = (uint32_t)SRC(ip, i*64+17) >> 5;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*64+18) << 6;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*64+19) << 17;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+20) << 28;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+21) << 39;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+22) << 50 | (uint64_t)SRC1(ip, i*64+23) << 61;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*11+ 4) = (uint32_t)SRC(ip, i*64+23) >> 3;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*64+25) << 19;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+26) << 30;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+27) << 41;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+28) << 52 | (uint64_t)SRC1(ip, i*64+29) << 63;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*11+ 5) = (uint32_t)SRC(ip, i*64+29) >> 1;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*64+30) << 10;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*64+31) << 21;\ +} + +#define BITPACK64_11(ip, op, parm) { \ + BITBLK64_11(ip, 0, op, parm); SRCI(ip); op += 11*4/sizeof(op[0]);\ +} + +#define BITBLK64_12(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint32_t)SRC(ip, i*16+ 1) << 12;\ + IPPB(ip, i*16+ 2, 
parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 24;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 3) << 36;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 4) << 48 | (uint64_t)SRC1(ip, i*16+5) << 60;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*3+ 1) = (uint32_t)SRC(ip, i*16+ 5) >> 4;\ + IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*16+ 6) << 8;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*16+ 7) << 20;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 9) << 44 | (uint64_t)SRC1(ip, i*16+10) << 56;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*3+ 2) = (uint32_t)SRC(ip, i*16+10) >> 8;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*3+ 2) |= (uint32_t)SRC(ip, i*16+11) << 4;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*3+ 2) |= (uint32_t)SRC(ip, i*16+12) << 16;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+13) << 28;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+14) << 40;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+15) << 52;\ +} + +#define BITPACK64_12(ip, op, parm) { \ + BITBLK64_12(ip, 0, op, parm);\ + BITBLK64_12(ip, 1, op, parm); SRCI(ip); op += 12*4/sizeof(op[0]);\ +} + +#define BITBLK64_13(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 13;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 26;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 39 | (uint64_t)SRC1(ip, i*64+4) << 52;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*13+ 1) = (uint32_t)SRC(ip, i*64+ 4) >> 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*13+ 1) |= (uint32_t)SRC(ip, i*64+ 5) << 1;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*13+ 1) |= (uint32_t)SRC(ip, i*64+ 6) << 14;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 27;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 53;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*13+ 2) = (uint32_t)SRC(ip, i*64+ 9) >> 11;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*64+10) << 2;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*64+11) << 15;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+12) << 28;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+13) << 41 | (uint64_t)SRC1(ip, i*64+14) << 54;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*13+ 3) = (uint32_t)SRC(ip, i*64+14) >> 10;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*13+ 3) |= (uint32_t)SRC(ip, i*64+15) << 3;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*13+ 3) |= (uint32_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+17) << 29;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+18) << 42 | (uint64_t)SRC1(ip, i*64+19) << 55;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*13+ 4) = (uint32_t)SRC(ip, i*64+19) >> 9;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*64+20) << 4;\ + IPPB(ip, 
i*64+21, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*64+21) << 17;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+22) << 30;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+23) << 43 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*13+ 5) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*13+ 5) |= (uint32_t)SRC(ip, i*64+25) << 5;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*13+ 5) |= (uint32_t)SRC(ip, i*64+26) << 18;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+27) << 31;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+28) << 44 | (uint64_t)SRC1(ip, i*64+29) << 57;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*13+ 6) = (uint32_t)SRC(ip, i*64+29) >> 7;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*64+30) << 6;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*64+31) << 19;\ +} + +#define BITPACK64_13(ip, op, parm) { \ + BITBLK64_13(ip, 0, op, parm); SRCI(ip); op += 13*4/sizeof(op[0]);\ +} + +#define BITBLK64_14(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint32_t)SRC(ip, i*32+ 1) << 14;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 28;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*32+ 3) << 42 | (uint64_t)SRC1(ip, i*32+4) << 56;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*7+ 1) = (uint32_t)SRC(ip, i*32+ 4) >> 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*7+ 1) |= (uint32_t)SRC(ip, i*32+ 5) << 6;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 20;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 7) << 34;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 8) << 48 | (uint64_t)SRC1(ip, i*32+9) << 62;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*7+ 2) = (uint32_t)SRC(ip, i*32+ 9) >> 2;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*7+ 2) |= (uint32_t)SRC(ip, i*32+10) << 12;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+11) << 26;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+12) << 40 | (uint64_t)SRC1(ip, i*32+13) << 54;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*7+ 3) = (uint32_t)SRC(ip, i*32+13) >> 10;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*32+14) << 4;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*32+15) << 18;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+16) << 32;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+17) << 46 | (uint64_t)SRC1(ip, i*32+18) << 60;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*7+ 4) = (uint32_t)SRC(ip, i*32+18) >> 4;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*7+ 4) |= (uint32_t)SRC(ip, i*32+19) << 10;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+20) << 24;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+21) << 38 | (uint64_t)SRC1(ip, i*32+22) << 52;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*7+ 5) = (uint32_t)SRC(ip, i*32+22) >> 12;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*7+ 5) |= (uint32_t)SRC(ip, i*32+23) << 2;\ + IPPB(ip, 
i*32+24, parm); *((uint64_t *)op+i*7+ 5) |= (uint32_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+25) << 30;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+26) << 44 | (uint64_t)SRC1(ip, i*32+27) << 58;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*7+ 6) = (uint32_t)SRC(ip, i*32+27) >> 6;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*7+ 6) |= (uint32_t)SRC(ip, i*32+28) << 8;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+29) << 22;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+30) << 36;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+31) << 50;\ +} + +#define BITPACK64_14(ip, op, parm) { \ + BITBLK64_14(ip, 0, op, parm); SRCI(ip); op += 14*4/sizeof(op[0]);\ +} + +#define BITBLK64_15(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint32_t)SRC(ip, i*64+ 1) << 15;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 30;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 45 | (uint64_t)SRC1(ip, i*64+4) << 60;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*15+ 1) = (uint32_t)SRC(ip, i*64+ 4) >> 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*15+ 1) |= (uint32_t)SRC(ip, i*64+ 5) << 11;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 26;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 41 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*15+ 2) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*15+ 2) |= (uint32_t)SRC(ip, i*64+ 9) << 7;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+10) << 22;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+11) << 37 | (uint64_t)SRC1(ip, i*64+12) << 52;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*15+ 3) = (uint32_t)SRC(ip, i*64+12) >> 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*15+ 3) |= (uint32_t)SRC(ip, i*64+13) << 3;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+14) << 18;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+15) << 33;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 63;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*15+ 4) = (uint32_t)SRC(ip, i*64+17) >> 1;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*15+ 4) |= (uint32_t)SRC(ip, i*64+18) << 14;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+19) << 29;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+20) << 44 | (uint64_t)SRC1(ip, i*64+21) << 59;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*15+ 5) = (uint32_t)SRC(ip, i*64+21) >> 5;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*15+ 5) |= (uint32_t)SRC(ip, i*64+22) << 10;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+23) << 25;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 55;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*15+ 6) = (uint32_t)SRC(ip, i*64+25) >> 9;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*15+ 6) |= (uint32_t)SRC(ip, i*64+26) << 6;\ 
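+ /* 32 values x 15 bits = 480 bits per block: seven full 64-bit words plus a 32-bit tail, matching the op += 15*4 byte advance in BITPACK64_15 below */ \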
+ IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+27) << 21;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+28) << 36 | (uint64_t)SRC1(ip, i*64+29) << 51;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*15+ 7) = (uint32_t)SRC(ip, i*64+29) >> 13;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*64+30) << 2;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*64+31) << 17;\ +} + +#define BITPACK64_15(ip, op, parm) { \ + BITBLK64_15(ip, 0, op, parm); SRCI(ip); op += 15*4/sizeof(op[0]);\ +} + +#define BITBLK64_16(ip, i, op, parm) { \ + IPPB(ip, i*4+ 0, parm); *(uint16_t *)(op+i*8+ 0) = SRC(ip, i*4+ 0);\ + IPPB(ip, i*4+ 1, parm); *(uint16_t *)(op+i*8+ 2) = SRC(ip, i*4+ 1);\ + IPPB(ip, i*4+ 2, parm); *(uint16_t *)(op+i*8+ 4) = SRC(ip, i*4+ 2);\ + IPPB(ip, i*4+ 3, parm); *(uint16_t *)(op+i*8+ 6) = SRC(ip, i*4+ 3);;\ +} + +#define BITPACK64_16(ip, op, parm) { \ + BITBLK64_16(ip, 0, op, parm);\ + BITBLK64_16(ip, 1, op, parm);\ + BITBLK64_16(ip, 2, op, parm);\ + BITBLK64_16(ip, 3, op, parm);\ + BITBLK64_16(ip, 4, op, parm);\ + BITBLK64_16(ip, 5, op, parm);\ + BITBLK64_16(ip, 6, op, parm);\ + BITBLK64_16(ip, 7, op, parm); SRCI(ip); op += 16*4/sizeof(op[0]);\ +} + +#define BITBLK64_17(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*17+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*17+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 17;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*17+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 34 | (uint64_t)SRC1(ip, i*64+3) << 51;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*17+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 13;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*17+ 1) |= (uint32_t)SRC(ip, i*64+ 4) << 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 21;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 38 | (uint64_t)SRC1(ip, i*64+7) << 55;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*17+ 2) = (uint32_t)SRC(ip, i*64+ 7) >> 9;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*17+ 2) |= (uint32_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 25;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+10) << 42 | (uint64_t)SRC1(ip, i*64+11) << 59;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*17+ 3) = (uint32_t)SRC(ip, i*64+11) >> 5;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*17+ 3) |= (uint32_t)SRC(ip, i*64+12) << 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+13) << 29;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+14) << 46 | (uint64_t)SRC1(ip, i*64+15) << 63;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*17+ 4) = (uint32_t)SRC(ip, i*64+15) >> 1;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+17) << 33 | (uint64_t)SRC1(ip, i*64+18) << 50;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*17+ 5) = (uint32_t)SRC(ip, i*64+18) >> 14;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*17+ 5) |= (uint32_t)SRC(ip, i*64+19) << 3;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+20) << 20;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+21) << 37 | (uint64_t)SRC1(ip, i*64+22) << 54;\ + IPPB(ip, i*64+22, parm); 
*((uint64_t *)op+i*17+ 6) = (uint32_t)SRC(ip, i*64+22) >> 10;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*17+ 6) |= (uint32_t)SRC(ip, i*64+23) << 7;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+24) << 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+25) << 41 | (uint64_t)SRC1(ip, i*64+26) << 58;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*17+ 7) = (uint32_t)SRC(ip, i*64+26) >> 6;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*17+ 7) |= (uint32_t)SRC(ip, i*64+27) << 11;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+28) << 28;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+29) << 45 | (uint64_t)SRC1(ip, i*64+30) << 62;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*17+ 8) = (uint32_t)SRC(ip, i*64+30) >> 2;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*17+ 8) |= (uint32_t)SRC(ip, i*64+31) << 15;\ +} + +#define BITPACK64_17(ip, op, parm) { \ + BITBLK64_17(ip, 0, op, parm); SRCI(ip); op += 17*4/sizeof(op[0]);\ +} + +#define BITBLK64_18(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 18;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 36 | (uint64_t)SRC1(ip, i*32+3) << 54;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*9+ 1) = (uint32_t)SRC(ip, i*32+ 3) >> 10;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*9+ 1) |= (uint32_t)SRC(ip, i*32+ 4) << 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 5) << 26;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 44 | (uint64_t)SRC1(ip, i*32+7) << 62;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*9+ 2) = (uint32_t)SRC(ip, i*32+ 7) >> 2;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 8) << 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 9) << 34 | (uint64_t)SRC1(ip, i*32+10) << 52;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*9+ 3) = (uint32_t)SRC(ip, i*32+10) >> 12;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*9+ 3) |= (uint32_t)SRC(ip, i*32+11) << 6;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+12) << 24;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+13) << 42 | (uint64_t)SRC1(ip, i*32+14) << 60;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*9+ 4) = (uint32_t)SRC(ip, i*32+14) >> 4;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*9+ 4) |= (uint32_t)SRC(ip, i*32+15) << 14;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 50;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*9+ 5) = (uint32_t)SRC(ip, i*32+17) >> 14;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*9+ 5) |= (uint32_t)SRC(ip, i*32+18) << 4;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+19) << 22;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+20) << 40 | (uint64_t)SRC1(ip, i*32+21) << 58;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*9+ 6) = (uint32_t)SRC(ip, i*32+21) >> 6;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*9+ 6) |= (uint32_t)SRC(ip, i*32+22) << 12;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*9+ 6) |= (uint64_t)SRC(ip, i*32+23) << 30 | (uint64_t)SRC1(ip, i*32+24) << 48;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*9+ 7) 
= (uint32_t)SRC(ip, i*32+24) >> 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*9+ 7) |= (uint32_t)SRC(ip, i*32+25) << 2;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+26) << 20;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+27) << 38 | (uint64_t)SRC1(ip, i*32+28) << 56;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*9+ 8) = (uint32_t)SRC(ip, i*32+28) >> 8;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*9+ 8) |= (uint32_t)SRC(ip, i*32+29) << 10;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+30) << 28;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+31) << 46;\ +} + +#define BITPACK64_18(ip, op, parm) { \ + BITBLK64_18(ip, 0, op, parm); SRCI(ip); op += 18*4/sizeof(op[0]);\ +} + +#define BITBLK64_19(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*19+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 19;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 38 | (uint64_t)SRC1(ip, i*64+3) << 57;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*19+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 7;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*19+ 1) |= (uint32_t)SRC(ip, i*64+ 4) << 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*19+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 31 | (uint64_t)SRC1(ip, i*64+6) << 50;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*19+ 2) = (uint32_t)SRC(ip, i*64+ 6) >> 14;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*19+ 2) |= (uint32_t)SRC(ip, i*64+ 7) << 5;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 43 | (uint64_t)SRC1(ip, i*64+10) << 62;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*19+ 3) = (uint32_t)SRC(ip, i*64+10) >> 2;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+11) << 17;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+12) << 36 | (uint64_t)SRC1(ip, i*64+13) << 55;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*19+ 4) = (uint32_t)SRC(ip, i*64+13) >> 9;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*19+ 4) |= (uint32_t)SRC(ip, i*64+14) << 10;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*19+ 4) |= (uint64_t)SRC(ip, i*64+15) << 29 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*19+ 5) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*19+ 5) |= (uint32_t)SRC(ip, i*64+17) << 3;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+18) << 22;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+19) << 41 | (uint64_t)SRC1(ip, i*64+20) << 60;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*19+ 6) = (uint32_t)SRC(ip, i*64+20) >> 4;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+21) << 15;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+22) << 34 | (uint64_t)SRC1(ip, i*64+23) << 53;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*19+ 7) = (uint32_t)SRC(ip, i*64+23) >> 11;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*19+ 7) |= (uint32_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*19+ 7) |= (uint64_t)SRC(ip, i*64+25) << 27 | (uint64_t)SRC1(ip, i*64+26) << 46;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*19+ 8) = 
(uint32_t)SRC(ip, i*64+26) >> 18;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*19+ 8) |= (uint32_t)SRC(ip, i*64+27) << 1;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+28) << 20;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+29) << 39 | (uint64_t)SRC1(ip, i*64+30) << 58;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*19+ 9) = (uint32_t)SRC(ip, i*64+30) >> 6;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*19+ 9) |= (uint32_t)SRC(ip, i*64+31) << 13;\ +} + +#define BITPACK64_19(ip, op, parm) { \ + BITBLK64_19(ip, 0, op, parm); SRCI(ip); op += 19*4/sizeof(op[0]);\ +} + +#define BITBLK64_20(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 20;\ + IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 40 | (uint64_t)SRC1(ip, i*16+3) << 60;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*5+ 1) = (uint32_t)SRC(ip, i*16+ 3) >> 4;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 4) << 16;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 5) << 36 | (uint64_t)SRC1(ip, i*16+6) << 56;\ + IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*5+ 2) = (uint32_t)SRC(ip, i*16+ 6) >> 8;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*5+ 2) |= (uint32_t)SRC(ip, i*16+ 7) << 12;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | (uint64_t)SRC1(ip, i*16+9) << 52;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*5+ 3) = (uint32_t)SRC(ip, i*16+ 9) >> 12;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*5+ 3) |= (uint32_t)SRC(ip, i*16+10) << 8;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*16+11) << 28 | (uint64_t)SRC1(ip, i*16+12) << 48;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*5+ 4) = (uint32_t)SRC(ip, i*16+12) >> 16;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*5+ 4) |= (uint32_t)SRC(ip, i*16+13) << 4;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+14) << 24;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+15) << 44;\ +} + +#define BITPACK64_20(ip, op, parm) { \ + BITBLK64_20(ip, 0, op, parm);\ + BITBLK64_20(ip, 1, op, parm); SRCI(ip); op += 20*4/sizeof(op[0]);\ +} + +#define BITBLK64_21(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*21+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 21;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 42 | (uint64_t)SRC1(ip, i*64+3) << 63;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*21+ 1) = (uint32_t)SRC(ip, i*64+ 3) >> 1;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 20;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 41 | (uint64_t)SRC1(ip, i*64+6) << 62;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*21+ 2) = (uint32_t)SRC(ip, i*64+ 6) >> 2;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 19;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 61;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*21+ 3) = (uint32_t)SRC(ip, i*64+ 9) >> 3;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+10) << 18;\ + IPPB(ip, 
i*64+11, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+11) << 39 | (uint64_t)SRC1(ip, i*64+12) << 60;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*21+ 4) = (uint32_t)SRC(ip, i*64+12) >> 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+13) << 17;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+14) << 38 | (uint64_t)SRC1(ip, i*64+15) << 59;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*21+ 5) = (uint32_t)SRC(ip, i*64+15) >> 5;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+16) << 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+17) << 37 | (uint64_t)SRC1(ip, i*64+18) << 58;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*21+ 6) = (uint32_t)SRC(ip, i*64+18) >> 6;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+19) << 15;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+20) << 36 | (uint64_t)SRC1(ip, i*64+21) << 57;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*21+ 7) = (uint32_t)SRC(ip, i*64+21) >> 7;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+22) << 14;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+23) << 35 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*21+ 8) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+25) << 13;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+26) << 34 | (uint64_t)SRC1(ip, i*64+27) << 55;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*21+ 9) = (uint32_t)SRC(ip, i*64+27) >> 9;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+28) << 12;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+29) << 33 | (uint64_t)SRC1(ip, i*64+30) << 54;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*21+10) = (uint32_t)SRC(ip, i*64+30) >> 10;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*21+10) |= (uint32_t)SRC(ip, i*64+31) << 11;\ +} + +#define BITPACK64_21(ip, op, parm) { \ + BITBLK64_21(ip, 0, op, parm); SRCI(ip); op += 21*4/sizeof(op[0]);\ +} + +#define BITBLK64_22(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 22 | (uint64_t)SRC1(ip, i*32+2) << 44;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*11+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 20;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*11+ 1) |= (uint32_t)SRC(ip, i*32+ 3) << 2;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*32+ 4) << 24 | (uint64_t)SRC1(ip, i*32+5) << 46;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*11+ 2) = (uint32_t)SRC(ip, i*32+ 5) >> 18;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*11+ 2) |= (uint32_t)SRC(ip, i*32+ 6) << 4;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*32+ 7) << 26 | (uint64_t)SRC1(ip, i*32+8) << 48;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*11+ 3) = (uint32_t)SRC(ip, i*32+ 8) >> 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*11+ 3) |= (uint32_t)SRC(ip, i*32+ 9) << 6;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*32+10) << 28 | (uint64_t)SRC1(ip, i*32+11) << 50;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*11+ 4) = (uint32_t)SRC(ip, i*32+11) >> 14;\ + IPPB(ip, i*32+12, parm); 
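/* value 11 above straddled words 3 and 4: SRC1 first peeked its low 14 bits into word 3, then SRC consumed it and ">> 14" kept the high 8 bits for word 4; this two-step pattern repeats for every word-crossing value */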
*((uint64_t *)op+i*11+ 4) |= (uint32_t)SRC(ip, i*32+12) << 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*32+13) << 30 | (uint64_t)SRC1(ip, i*32+14) << 52;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*11+ 5) = (uint32_t)SRC(ip, i*32+14) >> 12;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*11+ 5) |= (uint32_t)SRC(ip, i*32+15) << 10;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 54;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*11+ 6) = (uint32_t)SRC(ip, i*32+17) >> 10;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+18) << 12;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+19) << 34 | (uint64_t)SRC1(ip, i*32+20) << 56;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*11+ 7) = (uint32_t)SRC(ip, i*32+20) >> 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+21) << 14;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+22) << 36 | (uint64_t)SRC1(ip, i*32+23) << 58;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*11+ 8) = (uint32_t)SRC(ip, i*32+23) >> 6;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+24) << 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+25) << 38 | (uint64_t)SRC1(ip, i*32+26) << 60;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*11+ 9) = (uint32_t)SRC(ip, i*32+26) >> 4;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+27) << 18;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+28) << 40 | (uint64_t)SRC1(ip, i*32+29) << 62;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*11+10) = (uint32_t)SRC(ip, i*32+29) >> 2;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+30) << 20;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+31) << 42;\ +} + +#define BITPACK64_22(ip, op, parm) { \ + BITBLK64_22(ip, 0, op, parm); SRCI(ip); op += 22*4/sizeof(op[0]);\ +} + +#define BITBLK64_23(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*23+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*23+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 23 | (uint64_t)SRC1(ip, i*64+2) << 46;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*23+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 18;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*23+ 1) |= (uint32_t)SRC(ip, i*64+ 3) << 5;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*23+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 28 | (uint64_t)SRC1(ip, i*64+5) << 51;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*23+ 2) = (uint32_t)SRC(ip, i*64+ 5) >> 13;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 10;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 33 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*23+ 3) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 15;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+10) << 38 | (uint64_t)SRC1(ip, i*64+11) << 61;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*23+ 4) = (uint32_t)SRC(ip, i*64+11) >> 3;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*23+ 4) |= (uint64_t)SRC(ip, i*64+12) << 20 | (uint64_t)SRC1(ip, i*64+13) << 43;\ + IPPB(ip, i*64+13, parm); *((uint64_t 
*)op+i*23+ 5) = (uint32_t)SRC(ip, i*64+13) >> 21;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*23+ 5) |= (uint32_t)SRC(ip, i*64+14) << 2;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*23+ 5) |= (uint64_t)SRC(ip, i*64+15) << 25 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*23+ 6) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*23+ 6) |= (uint32_t)SRC(ip, i*64+17) << 7;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*23+ 6) |= (uint64_t)SRC(ip, i*64+18) << 30 | (uint64_t)SRC1(ip, i*64+19) << 53;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*23+ 7) = (uint32_t)SRC(ip, i*64+19) >> 11;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+20) << 12;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+21) << 35 | (uint64_t)SRC1(ip, i*64+22) << 58;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*23+ 8) = (uint32_t)SRC(ip, i*64+22) >> 6;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+23) << 17;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 63;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*23+ 9) = (uint32_t)SRC(ip, i*64+25) >> 1;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*23+ 9) |= (uint64_t)SRC(ip, i*64+26) << 22 | (uint64_t)SRC1(ip, i*64+27) << 45;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*23+10) = (uint32_t)SRC(ip, i*64+27) >> 19;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*23+10) |= (uint32_t)SRC(ip, i*64+28) << 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*23+10) |= (uint64_t)SRC(ip, i*64+29) << 27 | (uint64_t)SRC1(ip, i*64+30) << 50;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*23+11) = (uint32_t)SRC(ip, i*64+30) >> 14;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*23+11) |= (uint32_t)SRC(ip, i*64+31) << 9;\ +} + +#define BITPACK64_23(ip, op, parm) { \ + BITBLK64_23(ip, 0, op, parm); SRCI(ip); op += 23*4/sizeof(op[0]);\ +} + +#define BITBLK64_24(ip, i, op, parm) { ;\ + IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint32_t)SRC(ip, i*8+ 0) ;\ + IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*8+ 1) << 24 | (uint64_t)SRC1(ip, i*8+2) << 48;\ + IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*3+ 1) = (uint32_t)SRC(ip, i*8+ 2) >> 16;\ + IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*3+ 1) |= (uint32_t)SRC(ip, i*8+ 3) << 8;\ + IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*8+ 4) << 32 | (uint64_t)SRC1(ip, i*8+5) << 56;\ + IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*3+ 2) = (uint32_t)SRC(ip, i*8+ 5) >> 8;\ + IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 6) << 16;\ + IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 7) << 40;\ +} + +#define BITPACK64_24(ip, op, parm) { \ + BITBLK64_24(ip, 0, op, parm);\ + BITBLK64_24(ip, 1, op, parm);\ + BITBLK64_24(ip, 2, op, parm);\ + BITBLK64_24(ip, 3, op, parm); SRCI(ip); op += 24*4/sizeof(op[0]);\ +} + +#define BITBLK64_25(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*25+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*25+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 25 | (uint64_t)SRC1(ip, i*64+2) << 50;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*25+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 14;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 11;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 4) 
<< 36 | (uint64_t)SRC1(ip, i*64+5) << 61;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*25+ 2) = (uint32_t)SRC(ip, i*64+ 5) >> 3;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*25+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 22 | (uint64_t)SRC1(ip, i*64+7) << 47;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*25+ 3) = (uint32_t)SRC(ip, i*64+ 7) >> 17;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 33 | (uint64_t)SRC1(ip, i*64+10) << 58;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*25+ 4) = (uint32_t)SRC(ip, i*64+10) >> 6;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*25+ 4) |= (uint64_t)SRC(ip, i*64+11) << 19 | (uint64_t)SRC1(ip, i*64+12) << 44;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*25+ 5) = (uint32_t)SRC(ip, i*64+12) >> 20;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*25+ 5) |= (uint32_t)SRC(ip, i*64+13) << 5;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*25+ 5) |= (uint64_t)SRC(ip, i*64+14) << 30 | (uint64_t)SRC1(ip, i*64+15) << 55;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*25+ 6) = (uint32_t)SRC(ip, i*64+15) >> 9;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*25+ 6) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 41;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*25+ 7) = (uint32_t)SRC(ip, i*64+17) >> 23;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*25+ 7) |= (uint32_t)SRC(ip, i*64+18) << 2;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*25+ 7) |= (uint64_t)SRC(ip, i*64+19) << 27 | (uint64_t)SRC1(ip, i*64+20) << 52;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*25+ 8) = (uint32_t)SRC(ip, i*64+20) >> 12;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+21) << 13;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+22) << 38 | (uint64_t)SRC1(ip, i*64+23) << 63;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*25+ 9) = (uint32_t)SRC(ip, i*64+23) >> 1;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*25+ 9) |= (uint64_t)SRC(ip, i*64+24) << 24 | (uint64_t)SRC1(ip, i*64+25) << 49;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*25+10) = (uint32_t)SRC(ip, i*64+25) >> 15;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+26) << 10;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+27) << 35 | (uint64_t)SRC1(ip, i*64+28) << 60;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*25+11) = (uint32_t)SRC(ip, i*64+28) >> 4;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*25+11) |= (uint64_t)SRC(ip, i*64+29) << 21 | (uint64_t)SRC1(ip, i*64+30) << 46;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*25+12) = (uint32_t)SRC(ip, i*64+30) >> 18;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*25+12) |= (uint32_t)SRC(ip, i*64+31) << 7;\ +} + +#define BITPACK64_25(ip, op, parm) { \ + BITBLK64_25(ip, 0, op, parm); SRCI(ip); op += 25*4/sizeof(op[0]);\ +} + +#define BITBLK64_26(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 26 | (uint64_t)SRC1(ip, i*32+2) << 52;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*13+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 12;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 14 | (uint64_t)SRC1(ip, i*32+4) << 40;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*13+ 2) = (uint32_t)SRC(ip, i*32+ 4) >> 24;\ + IPPB(ip, i*32+ 5, parm); 
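/* IPPB is the per-value pre-processing hook defined by the .c file that includes this header (identity for plain packing, a delta for sorted input, ...); SRC consumes the current value, SRC1 only peeks at a word-crossing value, and the (uint32_t)/(uint64_t) casts pick 32-bit shifts whenever the result cannot overflow 32 bits */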
*((uint64_t *)op+i*13+ 2) |= (uint32_t)SRC(ip, i*32+ 5) << 2;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*32+ 6) << 28 | (uint64_t)SRC1(ip, i*32+7) << 54;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*13+ 3) = (uint32_t)SRC(ip, i*32+ 7) >> 10;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*32+ 8) << 16 | (uint64_t)SRC1(ip, i*32+9) << 42;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*13+ 4) = (uint32_t)SRC(ip, i*32+ 9) >> 22;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*13+ 4) |= (uint32_t)SRC(ip, i*32+10) << 4;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*32+11) << 30 | (uint64_t)SRC1(ip, i*32+12) << 56;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*13+ 5) = (uint32_t)SRC(ip, i*32+12) >> 8;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*32+13) << 18 | (uint64_t)SRC1(ip, i*32+14) << 44;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*13+ 6) = (uint32_t)SRC(ip, i*32+14) >> 20;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*13+ 6) |= (uint32_t)SRC(ip, i*32+15) << 6;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 58;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*13+ 7) = (uint32_t)SRC(ip, i*32+17) >> 6;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*13+ 7) |= (uint64_t)SRC(ip, i*32+18) << 20 | (uint64_t)SRC1(ip, i*32+19) << 46;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*13+ 8) = (uint32_t)SRC(ip, i*32+19) >> 18;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+20) << 8;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+21) << 34 | (uint64_t)SRC1(ip, i*32+22) << 60;\ + IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*13+ 9) = (uint32_t)SRC(ip, i*32+22) >> 4;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*13+ 9) |= (uint64_t)SRC(ip, i*32+23) << 22 | (uint64_t)SRC1(ip, i*32+24) << 48;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*13+10) = (uint32_t)SRC(ip, i*32+24) >> 16;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+25) << 10;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+26) << 36 | (uint64_t)SRC1(ip, i*32+27) << 62;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*13+11) = (uint32_t)SRC(ip, i*32+27) >> 2;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*13+11) |= (uint64_t)SRC(ip, i*32+28) << 24 | (uint64_t)SRC1(ip, i*32+29) << 50;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*13+12) = (uint32_t)SRC(ip, i*32+29) >> 14;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+30) << 12;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+31) << 38;\ +} + +#define BITPACK64_26(ip, op, parm) { \ + BITBLK64_26(ip, 0, op, parm); SRCI(ip); op += 26*4/sizeof(op[0]);\ +} + +#define BITBLK64_27(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*27+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*27+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 27 | (uint64_t)SRC1(ip, i*64+2) << 54;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*27+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 10;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*27+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 17 | (uint64_t)SRC1(ip, i*64+4) << 44;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*27+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 20;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 7;\ + IPPB(ip, 
i*64+ 6, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 34 | (uint64_t)SRC1(ip, i*64+7) << 61;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*27+ 3) = (uint32_t)SRC(ip, i*64+ 7) >> 3;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*27+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 24 | (uint64_t)SRC1(ip, i*64+9) << 51;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*27+ 4) = (uint32_t)SRC(ip, i*64+ 9) >> 13;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*27+ 4) |= (uint64_t)SRC(ip, i*64+10) << 14 | (uint64_t)SRC1(ip, i*64+11) << 41;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*27+ 5) = (uint32_t)SRC(ip, i*64+11) >> 23;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*27+ 5) |= (uint32_t)SRC(ip, i*64+12) << 4;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*27+ 5) |= (uint64_t)SRC(ip, i*64+13) << 31 | (uint64_t)SRC1(ip, i*64+14) << 58;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*27+ 6) = (uint32_t)SRC(ip, i*64+14) >> 6;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*27+ 6) |= (uint64_t)SRC(ip, i*64+15) << 21 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*27+ 7) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*27+ 7) |= (uint64_t)SRC(ip, i*64+17) << 11 | (uint64_t)SRC1(ip, i*64+18) << 38;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*27+ 8) = (uint32_t)SRC(ip, i*64+18) >> 26;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*27+ 8) |= (uint32_t)SRC(ip, i*64+19) << 1;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*27+ 8) |= (uint64_t)SRC(ip, i*64+20) << 28 | (uint64_t)SRC1(ip, i*64+21) << 55;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*27+ 9) = (uint32_t)SRC(ip, i*64+21) >> 9;\ + IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*27+ 9) |= (uint64_t)SRC(ip, i*64+22) << 18 | (uint64_t)SRC1(ip, i*64+23) << 45;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*27+10) = (uint32_t)SRC(ip, i*64+23) >> 19;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+24) << 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+25) << 35 | (uint64_t)SRC1(ip, i*64+26) << 62;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*27+11) = (uint32_t)SRC(ip, i*64+26) >> 2;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*27+11) |= (uint64_t)SRC(ip, i*64+27) << 25 | (uint64_t)SRC1(ip, i*64+28) << 52;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*27+12) = (uint32_t)SRC(ip, i*64+28) >> 12;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*27+12) |= (uint64_t)SRC(ip, i*64+29) << 15 | (uint64_t)SRC1(ip, i*64+30) << 42;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*27+13) = (uint32_t)SRC(ip, i*64+30) >> 22;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*27+13) |= (uint32_t)SRC(ip, i*64+31) << 5;\ +} + +#define BITPACK64_27(ip, op, parm) { \ + BITBLK64_27(ip, 0, op, parm); SRCI(ip); op += 27*4/sizeof(op[0]);\ +} + +#define BITBLK64_28(ip, i, op, parm) { ;\ + IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint32_t)SRC(ip, i*16+ 0) ;\ + IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 28 | (uint64_t)SRC1(ip, i*16+2) << 56;\ + IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*7+ 1) = (uint32_t)SRC(ip, i*16+ 2) >> 8;\ + IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*16+ 3) << 20 | (uint64_t)SRC1(ip, i*16+4) << 48;\ + IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*7+ 2) = (uint32_t)SRC(ip, i*16+ 4) >> 16;\ + IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*16+ 5) << 12 | (uint64_t)SRC1(ip, i*16+6) << 40;\ + IPPB(ip, i*16+ 6, parm); 
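/* b=28: a sub-block maps 16 values onto seven 64-bit words (16*28 = 448 bits), so BITPACK64_28 runs the block twice to cover 32 values */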
*((uint64_t *)op+i*7+ 3) = (uint32_t)SRC(ip, i*16+ 6) >> 24;\ + IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*7+ 3) |= (uint32_t)SRC(ip, i*16+ 7) << 4;\ + IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | (uint64_t)SRC1(ip, i*16+9) << 60;\ + IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*7+ 4) = (uint32_t)SRC(ip, i*16+ 9) >> 4;\ + IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*16+10) << 24 | (uint64_t)SRC1(ip, i*16+11) << 52;\ + IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*7+ 5) = (uint32_t)SRC(ip, i*16+11) >> 12;\ + IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*16+12) << 16 | (uint64_t)SRC1(ip, i*16+13) << 44;\ + IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*7+ 6) = (uint32_t)SRC(ip, i*16+13) >> 20;\ + IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+14) << 8;\ + IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+15) << 36;\ +} + +#define BITPACK64_28(ip, op, parm) { \ + BITBLK64_28(ip, 0, op, parm);\ + BITBLK64_28(ip, 1, op, parm); SRCI(ip); op += 28*4/sizeof(op[0]);\ +} + +#define BITBLK64_29(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*29+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*29+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 29 | (uint64_t)SRC1(ip, i*64+2) << 58;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*29+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 6;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*29+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 23 | (uint64_t)SRC1(ip, i*64+4) << 52;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*29+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 12;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*29+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 17 | (uint64_t)SRC1(ip, i*64+6) << 46;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*29+ 3) = (uint32_t)SRC(ip, i*64+ 6) >> 18;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*29+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 11 | (uint64_t)SRC1(ip, i*64+8) << 40;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*29+ 4) = (uint32_t)SRC(ip, i*64+ 8) >> 24;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 5;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+10) << 34 | (uint64_t)SRC1(ip, i*64+11) << 63;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*29+ 5) = (uint32_t)SRC(ip, i*64+11) >> 1;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*29+ 5) |= (uint64_t)SRC(ip, i*64+12) << 28 | (uint64_t)SRC1(ip, i*64+13) << 57;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*29+ 6) = (uint32_t)SRC(ip, i*64+13) >> 7;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*29+ 6) |= (uint64_t)SRC(ip, i*64+14) << 22 | (uint64_t)SRC1(ip, i*64+15) << 51;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*29+ 7) = (uint32_t)SRC(ip, i*64+15) >> 13;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*29+ 7) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 45;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*29+ 8) = (uint32_t)SRC(ip, i*64+17) >> 19;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*29+ 8) |= (uint64_t)SRC(ip, i*64+18) << 10 | (uint64_t)SRC1(ip, i*64+19) << 39;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*29+ 9) = (uint32_t)SRC(ip, i*64+19) >> 25;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+20) << 4;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+21) << 33 | (uint64_t)SRC1(ip, i*64+22) << 62;\ + IPPB(ip, i*64+22, parm); *((uint64_t 
*)op+i*29+10) = (uint32_t)SRC(ip, i*64+22) >> 2;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*29+10) |= (uint64_t)SRC(ip, i*64+23) << 27 | (uint64_t)SRC1(ip, i*64+24) << 56;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*29+11) = (uint32_t)SRC(ip, i*64+24) >> 8;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*29+11) |= (uint64_t)SRC(ip, i*64+25) << 21 | (uint64_t)SRC1(ip, i*64+26) << 50;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*29+12) = (uint32_t)SRC(ip, i*64+26) >> 14;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*29+12) |= (uint64_t)SRC(ip, i*64+27) << 15 | (uint64_t)SRC1(ip, i*64+28) << 44;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*29+13) = (uint32_t)SRC(ip, i*64+28) >> 20;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*29+13) |= (uint64_t)SRC(ip, i*64+29) << 9 | (uint64_t)SRC1(ip, i*64+30) << 38;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*29+14) = (uint32_t)SRC(ip, i*64+30) >> 26;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*29+14) |= (uint32_t)SRC(ip, i*64+31) << 3;\ +} + +#define BITPACK64_29(ip, op, parm) { \ + BITBLK64_29(ip, 0, op, parm); SRCI(ip); op += 29*4/sizeof(op[0]);\ +} + +#define BITBLK64_30(ip, i, op, parm) { ;\ + IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint32_t)SRC(ip, i*32+ 0) ;\ + IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 30 | (uint64_t)SRC1(ip, i*32+2) << 60;\ + IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*15+ 1) = (uint32_t)SRC(ip, i*32+ 2) >> 4;\ + IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 26 | (uint64_t)SRC1(ip, i*32+4) << 56;\ + IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*15+ 2) = (uint32_t)SRC(ip, i*32+ 4) >> 8;\ + IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*32+ 5) << 22 | (uint64_t)SRC1(ip, i*32+6) << 52;\ + IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*15+ 3) = (uint32_t)SRC(ip, i*32+ 6) >> 12;\ + IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*32+ 7) << 18 | (uint64_t)SRC1(ip, i*32+8) << 48;\ + IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*15+ 4) = (uint32_t)SRC(ip, i*32+ 8) >> 16;\ + IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*32+ 9) << 14 | (uint64_t)SRC1(ip, i*32+10) << 44;\ + IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*15+ 5) = (uint32_t)SRC(ip, i*32+10) >> 20;\ + IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*32+11) << 10 | (uint64_t)SRC1(ip, i*32+12) << 40;\ + IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*15+ 6) = (uint32_t)SRC(ip, i*32+12) >> 24;\ + IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*32+13) << 6 | (uint64_t)SRC1(ip, i*32+14) << 36;\ + IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*15+ 7) = (uint32_t)SRC(ip, i*32+14) >> 28;\ + IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*15+ 7) |= (uint32_t)SRC(ip, i*32+15) << 2;\ + IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 62;\ + IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*15+ 8) = (uint32_t)SRC(ip, i*32+17) >> 2;\ + IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*15+ 8) |= (uint64_t)SRC(ip, i*32+18) << 28 | (uint64_t)SRC1(ip, i*32+19) << 58;\ + IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*15+ 9) = (uint32_t)SRC(ip, i*32+19) >> 6;\ + IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*15+ 9) |= (uint64_t)SRC(ip, i*32+20) << 24 | (uint64_t)SRC1(ip, i*32+21) << 54;\ + IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*15+10) = (uint32_t)SRC(ip, i*32+21) >> 10;\ + IPPB(ip, i*32+22, parm); *((uint64_t 
*)op+i*15+10) |= (uint64_t)SRC(ip, i*32+22) << 20 | (uint64_t)SRC1(ip, i*32+23) << 50;\ + IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*15+11) = (uint32_t)SRC(ip, i*32+23) >> 14;\ + IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*15+11) |= (uint64_t)SRC(ip, i*32+24) << 16 | (uint64_t)SRC1(ip, i*32+25) << 46;\ + IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*15+12) = (uint32_t)SRC(ip, i*32+25) >> 18;\ + IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*15+12) |= (uint64_t)SRC(ip, i*32+26) << 12 | (uint64_t)SRC1(ip, i*32+27) << 42;\ + IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*15+13) = (uint32_t)SRC(ip, i*32+27) >> 22;\ + IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*15+13) |= (uint64_t)SRC(ip, i*32+28) << 8 | (uint64_t)SRC1(ip, i*32+29) << 38;\ + IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*15+14) = (uint32_t)SRC(ip, i*32+29) >> 26;\ + IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+30) << 4;\ + IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+31) << 34;\ +} + +#define BITPACK64_30(ip, op, parm) { \ + BITBLK64_30(ip, 0, op, parm); SRCI(ip); op += 30*4/sizeof(op[0]);\ +} + +#define BITBLK64_31(ip, i, op, parm) { ;\ + IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*31+ 0) = (uint32_t)SRC(ip, i*64+ 0) ;\ + IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*31+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 31 | (uint64_t)SRC1(ip, i*64+2) << 62;\ + IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*31+ 1) = (uint32_t)SRC(ip, i*64+ 2) >> 2;\ + IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*31+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 29 | (uint64_t)SRC1(ip, i*64+4) << 60;\ + IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*31+ 2) = (uint32_t)SRC(ip, i*64+ 4) >> 4;\ + IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*31+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 27 | (uint64_t)SRC1(ip, i*64+6) << 58;\ + IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*31+ 3) = (uint32_t)SRC(ip, i*64+ 6) >> 6;\ + IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*31+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 25 | (uint64_t)SRC1(ip, i*64+8) << 56;\ + IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*31+ 4) = (uint32_t)SRC(ip, i*64+ 8) >> 8;\ + IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*31+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 23 | (uint64_t)SRC1(ip, i*64+10) << 54;\ + IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*31+ 5) = (uint32_t)SRC(ip, i*64+10) >> 10;\ + IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*31+ 5) |= (uint64_t)SRC(ip, i*64+11) << 21 | (uint64_t)SRC1(ip, i*64+12) << 52;\ + IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*31+ 6) = (uint32_t)SRC(ip, i*64+12) >> 12;\ + IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*31+ 6) |= (uint64_t)SRC(ip, i*64+13) << 19 | (uint64_t)SRC1(ip, i*64+14) << 50;\ + IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*31+ 7) = (uint32_t)SRC(ip, i*64+14) >> 14;\ + IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*31+ 7) |= (uint64_t)SRC(ip, i*64+15) << 17 | (uint64_t)SRC1(ip, i*64+16) << 48;\ + IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*31+ 8) = (uint32_t)SRC(ip, i*64+16) >> 16;\ + IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*31+ 8) |= (uint64_t)SRC(ip, i*64+17) << 15 | (uint64_t)SRC1(ip, i*64+18) << 46;\ + IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*31+ 9) = (uint32_t)SRC(ip, i*64+18) >> 18;\ + IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*31+ 9) |= (uint64_t)SRC(ip, i*64+19) << 13 | (uint64_t)SRC1(ip, i*64+20) << 44;\ + IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*31+10) = (uint32_t)SRC(ip, i*64+20) >> 20;\ + IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*31+10) |= (uint64_t)SRC(ip, i*64+21) << 11 | (uint64_t)SRC1(ip, i*64+22) << 42;\ + 
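/* b=31: a 64-bit word barely holds two values, so every even-indexed value from 2 on is split across a word boundary */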
IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*31+11) = (uint32_t)SRC(ip, i*64+22) >> 22;\ + IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*31+11) |= (uint64_t)SRC(ip, i*64+23) << 9 | (uint64_t)SRC1(ip, i*64+24) << 40;\ + IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*31+12) = (uint32_t)SRC(ip, i*64+24) >> 24;\ + IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*31+12) |= (uint64_t)SRC(ip, i*64+25) << 7 | (uint64_t)SRC1(ip, i*64+26) << 38;\ + IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*31+13) = (uint32_t)SRC(ip, i*64+26) >> 26;\ + IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*31+13) |= (uint64_t)SRC(ip, i*64+27) << 5 | (uint64_t)SRC1(ip, i*64+28) << 36;\ + IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*31+14) = (uint32_t)SRC(ip, i*64+28) >> 28;\ + IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*31+14) |= (uint64_t)SRC(ip, i*64+29) << 3 | (uint64_t)SRC1(ip, i*64+30) << 34;\ + IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*31+15) = (uint32_t)SRC(ip, i*64+30) >> 30;\ + IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*31+15) |= (uint32_t)SRC(ip, i*64+31) << 1;\ +} + +#define BITPACK64_31(ip, op, parm) { \ + BITBLK64_31(ip, 0, op, parm); SRCI(ip); op += 31*4/sizeof(op[0]);\ +} + +#define BITBLK64_32(ip, i, op, parm) { \ + IPPB(ip, i*2+ 0, parm); *(uint32_t *)(op+i*8+ 0) = SRC(ip, i*2+ 0);\ + IPPB(ip, i*2+ 1, parm); *(uint32_t *)(op+i*8+ 4) = SRC(ip, i*2+ 1);;\ +} + +#define BITPACK64_32(ip, op, parm) { \ + BITBLK64_32(ip, 0, op, parm);\ + BITBLK64_32(ip, 1, op, parm);\ + BITBLK64_32(ip, 2, op, parm);\ + BITBLK64_32(ip, 3, op, parm);\ + BITBLK64_32(ip, 4, op, parm);\ + BITBLK64_32(ip, 5, op, parm);\ + BITBLK64_32(ip, 6, op, parm);\ + BITBLK64_32(ip, 7, op, parm);\ + BITBLK64_32(ip, 8, op, parm);\ + BITBLK64_32(ip, 9, op, parm);\ + BITBLK64_32(ip, 10, op, parm);\ + BITBLK64_32(ip, 11, op, parm);\ + BITBLK64_32(ip, 12, op, parm);\ + BITBLK64_32(ip, 13, op, parm);\ + BITBLK64_32(ip, 14, op, parm);\ + BITBLK64_32(ip, 15, op, parm); SRCI(ip); op += 32*4/sizeof(op[0]);\ +} + diff --git a/bitpack_.h b/bitpack_.h new file mode 100644 index 0000000..3fcabd2 --- /dev/null +++ b/bitpack_.h @@ -0,0 +1,200 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitpack_.h - "Integer Compression" binary packing
+**/
+
+#include <stdint.h>
+#define USE_BITPACK 64
+#if 0
+#define SRCI(__ip) __ip+=32
+#define SRC(__ip,__x) __ip[__x]
+#define SRC1(__ip,__x) __ip[__x]
+//#define SRCP( __ip)
+#else
+#define SRCI(__ip)
+#define SRC1(__ip,__x) (*(__ip/*+1*/))
+#define SRC( __ip,__x) (*__ip++)
+//#define SRCP( __ip) (__ip++)
+#endif
+ #if USE_BITPACK == 64
+#include "bitpack64_.h"
+/* pack __n integers at __nbits bits each: dispatch once on the bit width, then run the unrolled 32-value kernel until the input is consumed; __ip and __op advance past the consumed input and the packed output */
+#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\
+ switch(__nbits) {\
+ case 0:__ip = __ipe; break;\
+ case 1:do BITPACK64_1( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 2:do BITPACK64_2( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 3:do BITPACK64_3( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 4:do BITPACK64_4( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 5:do BITPACK64_5( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 6:do BITPACK64_6( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 7:do BITPACK64_7( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 8:do BITPACK64_8( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 9:do BITPACK64_9( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 10:do BITPACK64_10(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 11:do BITPACK64_11(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 12:do BITPACK64_12(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 13:do BITPACK64_13(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 14:do BITPACK64_14(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 15:do BITPACK64_15(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 16:do BITPACK64_16(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 17:do BITPACK64_17(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 18:do BITPACK64_18(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 19:do BITPACK64_19(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 20:do BITPACK64_20(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 21:do BITPACK64_21(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 22:do BITPACK64_22(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 23:do BITPACK64_23(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 24:do BITPACK64_24(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 25:do BITPACK64_25(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 26:do BITPACK64_26(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 27:do BITPACK64_27(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 28:do BITPACK64_28(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 29:do BITPACK64_29(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 30:do BITPACK64_30(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 31:do BITPACK64_31(__ip, __op, __parm) while(__ip < __ipe); break;\
+ case 32:do BITPACK64_32(__ip, __op, __parm) while(__ip < __ipe);\
+ }\
+} while(0)
+ #elif USE_BITPACK == 32
+#include "bitpack32_.h"
+#define BITPACK32(__ip, __n, __nbits, __op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\
+ switch(__nbits) {\
+ case 0:__ip = __ipe; break;\
+ case 1:do BITPACK32_1( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 2:do BITPACK32_2( __ip, __op, __parm) while(__ip < __ipe); break;\
+ case 3:do
BITPACK32_3( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 4:do BITPACK32_4( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 5:do BITPACK32_5( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 6:do BITPACK32_6( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 7:do BITPACK32_7( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 8:do BITPACK32_8( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 9:do BITPACK32_9( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 10:do BITPACK32_10(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 11:do BITPACK32_11(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 12:do BITPACK32_12(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 13:do BITPACK32_13(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 14:do BITPACK32_14(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 15:do BITPACK32_15(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 16:do BITPACK32_16(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 17:do BITPACK32_17(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 18:do BITPACK32_18(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 19:do BITPACK32_19(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 20:do BITPACK32_20(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 21:do BITPACK32_21(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 22:do BITPACK32_22(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 23:do BITPACK32_23(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 24:do BITPACK32_24(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 25:do BITPACK32_25(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 26:do BITPACK32_26(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 27:do BITPACK32_27(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 28:do BITPACK32_28(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 29:do BITPACK32_29(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 30:do BITPACK32_30(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 31:do BITPACK32_31(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 32:do BITPACK32_32(__ip, __op, __parm) while(__ip < __ipe);\ + } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\ +} while(0) + #else + #if 1 +#define SRCI(__ip) __ip+=32 +#define SRC(__ip,__x) __ip[__x] +#define SRCP( __ip) + #else +#define SRCI(__ip) +#define SRC( __ip,__x) (*__ip++) +//#define SRCP( __ip) (__ip++) + #endif +#include "pack/bitpack32_1.h" +#include "pack/bitpack32_2.h" +#include "pack/bitpack32_3.h" +#include "pack/bitpack32_4.h" +#include "pack/bitpack32_5.h" +#include "pack/bitpack32_6.h" +#include "pack/bitpack32_7.h" +#include "pack/bitpack32_8.h" +#include "pack/bitpack32_9.h" +#include "pack/bitpack32_10.h" +#include "pack/bitpack32_11.h" +#include "pack/bitpack32_12.h" +#include "pack/bitpack32_13.h" +#include "pack/bitpack32_14.h" +#include "pack/bitpack32_15.h" +#include "pack/bitpack32_16.h" +#include "pack/bitpack32_17.h" +#include "pack/bitpack32_18.h" +#include "pack/bitpack32_19.h" +#include "pack/bitpack32_20.h" +#include "pack/bitpack32_21.h" +#include "pack/bitpack32_22.h" +#include "pack/bitpack32_23.h" +#include "pack/bitpack32_24.h" +#include "pack/bitpack32_25.h" +#include "pack/bitpack32_26.h" +#include "pack/bitpack32_27.h" +#include "pack/bitpack32_28.h" +#include "pack/bitpack32_29.h" +#include "pack/bitpack32_30.h" +#include "pack/bitpack32_31.h" +#include "pack/bitpack32_32.h" +#define BITPACK32(__ip, __n, __nbits, 
__op, __parm) do { typeof(__ip[0]) *__ipe=(__ip)+(__n);/*((__n+31)&0xffffffe0u)*/;\ + switch(__nbits) {\ + case 0:__ip = __ipe; break;\ + case 1:do BITPACK_1( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 2:do BITPACK_2( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 3:do BITPACK_3( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 4:do BITPACK_4( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 5:do BITPACK_5( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 6:do BITPACK_6( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 7:do BITPACK_7( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 8:do BITPACK_8( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 9:do BITPACK_9( __ip, __op, __parm) while(__ip < __ipe); break;\ + case 10:do BITPACK_10(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 11:do BITPACK_11(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 12:do BITPACK_12(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 13:do BITPACK_13(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 14:do BITPACK_14(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 15:do BITPACK_15(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 16:do BITPACK_16(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 17:do BITPACK_17(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 18:do BITPACK_18(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 19:do BITPACK_19(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 20:do BITPACK_20(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 21:do BITPACK_21(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 22:do BITPACK_22(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 23:do BITPACK_23(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 24:do BITPACK_24(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 25:do BITPACK_25(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 26:do BITPACK_26(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 27:do BITPACK_27(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 28:do BITPACK_28(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 29:do BITPACK_29(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 30:do BITPACK_30(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 31:do BITPACK_31(__ip, __op, __parm) while(__ip < __ipe); break;\ + case 32:do BITPACK_32(__ip, __op, __parm) while(__ip < __ipe);\ + } /*printf("p=%d,%d,%d ", __n, __ip - __ipe, __ip - sc);*/\ +} while(0) + #endif +// + diff --git a/bitunpack.c b/bitunpack.c new file mode 100644 index 0000000..830ad4b --- /dev/null +++ b/bitunpack.c @@ -0,0 +1,56 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack.c - "Integer Compression" binary packing
+**/
+
+#include "bitunpack.h"
+
+#define PAD8(__x) (((__x)+7)/8)
+unsigned char * bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i; for(i=0; i < n; i++ ) out[i] = bitgetx32(in, b, i); return in + PAD8(n*b); }
+unsigned char *_bitunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned i,k=0; for(i=0; i < n; i++,k+=b ) *out++ = _bitgetx32(in, b, k); return in + PAD8(n*b); }
+
+// BPI = identity: unpack the stored values unchanged
+#define BPI(__w,__parm) __w
+#include "bitunpack_.h"
+unsigned char *bitunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
+unsigned char *bitunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, unsigned short *__restrict__ out) { unsigned char *pin = in+PAD8(n*b); BITUNPACK32(in, n, b, out, 0); return pin; }
+#undef BPI
+
+//------------------------------------------------------------------------------------------
+// BPI = running sum: delta-decode a strictly increasing sequence (each stored value is gap-1)
+#define BPI(__w,__parm) (__parm += (__w) + 1)
+#include "bitunpack_.h"
+
+unsigned char *bitdunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitdunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = (start += bitgetx32(in, b, i)+1); return in + PAD8(n*b); }
+unsigned char *bitdunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+#undef BPI
+
+//------------------------------------------------------------------------------------------
+// BPI = start + value + 1: frame-of-reference decode against a fixed base, no running sum
+#define BPI(__w,__parm) (__parm + (__w) + 1)
+#include "bitunpack_.h"
+
+unsigned char *bitfunpack32( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+unsigned char *bitfunpackx32(unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned *__restrict__ out) { int i; for(i = 0; i < n; i++) out[i] = bitgetx32(in, b, i)+start+1; return in + PAD8(n*b); }
+unsigned char *bitfunpack16( unsigned char *__restrict__ in, unsigned n, unsigned b, int start, unsigned short *__restrict__ out) { unsigned char *pin=in+PAD8(n*b); BITUNPACK32(in, n, b, out, start); return pin; }
+#undef BPI
+
diff --git a/bitunpack.h b/bitunpack.h
new file mode 100644
index 0000000..ff1054c
--- /dev/null
+++ b/bitunpack.h
@@ -0,0 +1,51 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack.h - "Integer Compression" binary packing
+**/
+
+// BP: direct/random access to single bit-packed values without decoding a block
+static inline unsigned bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
+static inline unsigned _bitgetx32(unsigned *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned long long *)(in+(bidx>>5))) >> (bidx&0x1f)) & ((1ull<<b)-1); }
+static inline unsigned bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned idx) { unsigned bidx = b*idx; return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
+static inline unsigned _bitgetx16(unsigned short *__restrict__ in, unsigned b, unsigned bidx) { return ((*(unsigned *)(in+(bidx>>4))) >> (bidx& 0xf)) & ((1 <<b)-1); }
+/* parts of the getters above and the whole declaration of the helper below were unreadable in this copy; they are reconstructed from the surviving fragments, and the helper's name and exact signature are assumptions: it scans the packed values and returns the first one >= val (index in *oidx), or INT_MAX (from <limits.h>) if none */
+static inline int bitgetgeqx32(unsigned *__restrict__ in, unsigned b, unsigned n, unsigned val, unsigned *__restrict__ oidx) { unsigned idx; for(idx = 0; idx < n; idx++) { unsigned oval = bitgetx32(in, b, idx); if(oval >= val) { *oidx=idx; return oval; } } return INT_MAX; }
+
diff --git a/bitunpack64_.h b/bitunpack64_.h
new file mode 100644
index 0000000..88ac332
--- /dev/null
+++ b/bitunpack64_.h
@@ -0,0 +1,1365 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ bitunpack64_.h - "Integer Compression" binary packing
+**/
+
+/* DST and DSTI are supplied by the including bitunpack_.h (they apply the BPI hook and advance the output); the b=0 block below emits the constant payload 0 for all 32 positions */
+#define BITUNBLK32_0(ip, i, op, parm) { \
+ DST(op,i*0+ 0, 0, parm);\
+ DST(op,i*0+ 1, 0, parm);\
+ DST(op,i*0+ 2, 0, parm);\
+ DST(op,i*0+ 3, 0, parm);\
+ DST(op,i*0+ 4, 0, parm);\
+ DST(op,i*0+ 5, 0, parm);\
+ DST(op,i*0+ 6, 0, parm);\
+ DST(op,i*0+ 7, 0, parm);\
+ DST(op,i*0+ 8, 0, parm);\
+ DST(op,i*0+ 9, 0, parm);\
+ DST(op,i*0+10, 0, parm);\
+ DST(op,i*0+11, 0, parm);\
+ DST(op,i*0+12, 0, parm);\
+ DST(op,i*0+13, 0, parm);\
+ DST(op,i*0+14, 0, parm);\
+ DST(op,i*0+15, 0, parm);\
+ DST(op,i*0+16, 0, parm);\
+ DST(op,i*0+17, 0, parm);\
+ DST(op,i*0+18, 0, parm);\
+ DST(op,i*0+19, 0, parm);\
+ DST(op,i*0+20, 0, parm);\
+ DST(op,i*0+21, 0, parm);\
+ DST(op,i*0+22, 0, parm);\
+ DST(op,i*0+23, 0, parm);\
+ DST(op,i*0+24, 0, parm);\
+ DST(op,i*0+25, 0, parm);\
+ DST(op,i*0+26, 0, parm);\
+ DST(op,i*0+27, 0, parm);\
+ DST(op,i*0+28, 0, parm);\
+ DST(op,i*0+29, 0, parm);\
+ DST(op,i*0+30, 0, parm);\
+ DST(op,i*0+31, 0, parm);;\
+}
+
+#define BITUNPACK64_0(ip, op, parm) { \
+ BITUNBLK32_0(ip, 0, op, parm); DSTI(op);\
+}
+
+#define BITUNBLK32_1(ip, i, op, parm) { register uint32_t w0 = *(uint32_t *)(ip+(i*1+0)*4/sizeof(ip[0]));\
+ DST(op,i*32+ 0, (w0 ) & 0x1, parm);\
+ DST(op,i*32+ 1, (w0 >> 1) & 0x1, parm);\
+ DST(op,i*32+ 2, (w0 >> 2) & 0x1, parm);\
+ DST(op,i*32+ 3, (w0 >> 3) & 0x1, parm);\
+ DST(op,i*32+ 4, (w0 >> 4) & 0x1, parm);\
+ DST(op,i*32+ 5, (w0 >> 5) & 0x1, parm);\
+ DST(op,i*32+ 6, (w0 >> 6) & 0x1, parm);\
+ DST(op,i*32+ 7, (w0 >> 7) & 0x1, parm);\
+ DST(op,i*32+ 8, (w0 >> 8) & 0x1, parm);\
+ DST(op,i*32+ 9, (w0 >> 9) & 0x1, parm);\
+ DST(op,i*32+10, (w0 >> 10) & 0x1, parm);\
+
DST(op,i*32+11, (w0 >> 11) & 0x1, parm);\ + DST(op,i*32+12, (w0 >> 12) & 0x1, parm);\ + DST(op,i*32+13, (w0 >> 13) & 0x1, parm);\ + DST(op,i*32+14, (w0 >> 14) & 0x1, parm);\ + DST(op,i*32+15, (w0 >> 15) & 0x1, parm);\ + DST(op,i*32+16, (w0 >> 16) & 0x1, parm);\ + DST(op,i*32+17, (w0 >> 17) & 0x1, parm);\ + DST(op,i*32+18, (w0 >> 18) & 0x1, parm);\ + DST(op,i*32+19, (w0 >> 19) & 0x1, parm);\ + DST(op,i*32+20, (w0 >> 20) & 0x1, parm);\ + DST(op,i*32+21, (w0 >> 21) & 0x1, parm);\ + DST(op,i*32+22, (w0 >> 22) & 0x1, parm);\ + DST(op,i*32+23, (w0 >> 23) & 0x1, parm);\ + DST(op,i*32+24, (w0 >> 24) & 0x1, parm);\ + DST(op,i*32+25, (w0 >> 25) & 0x1, parm);\ + DST(op,i*32+26, (w0 >> 26) & 0x1, parm);\ + DST(op,i*32+27, (w0 >> 27) & 0x1, parm);\ + DST(op,i*32+28, (w0 >> 28) & 0x1, parm);\ + DST(op,i*32+29, (w0 >> 29) & 0x1, parm);\ + DST(op,i*32+30, (w0 >> 30) & 0x1, parm);\ + DST(op,i*32+31, (w0 >> 31) , parm);;\ +} + +#define BITUNPACK64_1(ip, op, parm) { \ + BITUNBLK32_1(ip, 0, op, parm); DSTI(op); ip += 1*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_2(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3, parm);\ + DST(op,i*32+ 1, (w0 >> 2) & 0x3, parm);\ + DST(op,i*32+ 2, (w0 >> 4) & 0x3, parm);\ + DST(op,i*32+ 3, (w0 >> 6) & 0x3, parm);\ + DST(op,i*32+ 4, (w0 >> 8) & 0x3, parm);\ + DST(op,i*32+ 5, (w0 >> 10) & 0x3, parm);\ + DST(op,i*32+ 6, (w0 >> 12) & 0x3, parm);\ + DST(op,i*32+ 7, (w0 >> 14) & 0x3, parm);\ + DST(op,i*32+ 8, (w0 >> 16) & 0x3, parm);\ + DST(op,i*32+ 9, (w0 >> 18) & 0x3, parm);\ + DST(op,i*32+10, (w0 >> 20) & 0x3, parm);\ + DST(op,i*32+11, (w0 >> 22) & 0x3, parm);\ + DST(op,i*32+12, (w0 >> 24) & 0x3, parm);\ + DST(op,i*32+13, (w0 >> 26) & 0x3, parm);\ + DST(op,i*32+14, (w0 >> 28) & 0x3, parm);\ + DST(op,i*32+15, (w0 >> 30) & 0x3, parm);\ + DST(op,i*32+16, (w0 >> 32) & 0x3, parm);\ + DST(op,i*32+17, (w0 >> 34) & 0x3, parm);\ + DST(op,i*32+18, (w0 >> 36) & 0x3, parm);\ + DST(op,i*32+19, (w0 >> 38) & 0x3, parm);\ + DST(op,i*32+20, (w0 >> 40) & 0x3, parm);\ + DST(op,i*32+21, (w0 >> 42) & 0x3, parm);\ + DST(op,i*32+22, (w0 >> 44) & 0x3, parm);\ + DST(op,i*32+23, (w0 >> 46) & 0x3, parm);\ + DST(op,i*32+24, (w0 >> 48) & 0x3, parm);\ + DST(op,i*32+25, (w0 >> 50) & 0x3, parm);\ + DST(op,i*32+26, (w0 >> 52) & 0x3, parm);\ + DST(op,i*32+27, (w0 >> 54) & 0x3, parm);\ + DST(op,i*32+28, (w0 >> 56) & 0x3, parm);\ + DST(op,i*32+29, (w0 >> 58) & 0x3, parm);\ + DST(op,i*32+30, (w0 >> 60) & 0x3, parm);\ + DST(op,i*32+31, (w0 >> 62) , parm);;\ +} + +#define BITUNPACK64_2(ip, op, parm) { \ + BITUNBLK64_2(ip, 0, op, parm); DSTI(op); ip += 2*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint32_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7, parm);\ + DST(op,i*64+ 1, (w0 >> 3) & 0x7, parm);\ + DST(op,i*64+ 2, (w0 >> 6) & 0x7, parm);\ + DST(op,i*64+ 3, (w0 >> 9) & 0x7, parm);\ + DST(op,i*64+ 4, (w0 >> 12) & 0x7, parm);\ + DST(op,i*64+ 5, (w0 >> 15) & 0x7, parm);\ + DST(op,i*64+ 6, (w0 >> 18) & 0x7, parm);\ + DST(op,i*64+ 7, (w0 >> 21) & 0x7, parm);\ + DST(op,i*64+ 8, (w0 >> 24) & 0x7, parm);\ + DST(op,i*64+ 9, (w0 >> 27) & 0x7, parm);\ + DST(op,i*64+10, (w0 >> 30) & 0x7, parm);\ + DST(op,i*64+11, (w0 >> 33) & 0x7, parm);\ + DST(op,i*64+12, (w0 >> 36) & 0x7, parm);\ + DST(op,i*64+13, (w0 >> 39) & 0x7, parm);\ + DST(op,i*64+14, (w0 >> 42) & 0x7, parm);\ + DST(op,i*64+15, (w0 >> 45) & 0x7, parm);\ + DST(op,i*64+16, (w0 >> 
48) & 0x7, parm);\ + DST(op,i*64+17, (w0 >> 51) & 0x7, parm);\ + DST(op,i*64+18, (w0 >> 54) & 0x7, parm);\ + DST(op,i*64+19, (w0 >> 57) & 0x7, parm);\ + DST(op,i*64+20, (w0 >> 60) & 0x7, parm); \ +\ + DST(op,i*64+21, (w0 >> 63) | (w1 << 1) & 0x7, parm);\ + DST(op,i*64+22, (w1 >> 2) & 0x7, parm);\ + DST(op,i*64+23, (w1 >> 5) & 0x7, parm);\ + DST(op,i*64+24, (w1 >> 8) & 0x7, parm);\ + DST(op,i*64+25, (w1 >> 11) & 0x7, parm);\ + DST(op,i*64+26, (w1 >> 14) & 0x7, parm);\ + DST(op,i*64+27, (w1 >> 17) & 0x7, parm);\ + DST(op,i*64+28, (w1 >> 20) & 0x7, parm);\ + DST(op,i*64+29, (w1 >> 23) & 0x7, parm);\ + DST(op,i*64+30, (w1 >> 26) & 0x7, parm);\ + DST(op,i*64+31, (w1 >> 29) & 0x7, parm);;\ +} + +#define BITUNPACK64_3(ip, op, parm) { \ + BITUNBLK64_3(ip, 0, op, parm); DSTI(op); ip += 3*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip/*+(i*1+0)*8/sizeof(ip[0])*/);ip += 8/sizeof(ip[0]);\ + DST(op,i*16+ 0, (w0 ) & 0xf, parm);\ + DST(op,i*16+ 1, (w0 >> 4) & 0xf, parm);\ + DST(op,i*16+ 2, (w0 >> 8) & 0xf, parm);\ + DST(op,i*16+ 3, (w0 >> 12) & 0xf, parm);\ + DST(op,i*16+ 4, (w0 >> 16) & 0xf, parm);\ + DST(op,i*16+ 5, (w0 >> 20) & 0xf, parm);\ + DST(op,i*16+ 6, (w0 >> 24) & 0xf, parm);\ + DST(op,i*16+ 7, (w0 >> 28) & 0xf, parm);\ + DST(op,i*16+ 8, (w0 >> 32) & 0xf, parm);\ + DST(op,i*16+ 9, (w0 >> 36) & 0xf, parm);\ + DST(op,i*16+10, (w0 >> 40) & 0xf, parm);\ + DST(op,i*16+11, (w0 >> 44) & 0xf, parm);\ + DST(op,i*16+12, (w0 >> 48) & 0xf, parm);\ + DST(op,i*16+13, (w0 >> 52) & 0xf, parm);\ + DST(op,i*16+14, (w0 >> 56) & 0xf, parm);\ + DST(op,i*16+15, (w0 >> 60), parm);;\ +} + +#define BITUNPACK64_4(ip, op, parm) { \ + BITUNBLK64_4(ip, 0, op, parm);\ + BITUNBLK64_4(ip, 1, op, parm); DSTI(op); /*ip += 4*4/sizeof(ip[0]);*/\ +} + +#define BITUNBLK64_5(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1f, parm);\ + DST(op,i*64+ 1, (w0 >> 5) & 0x1f, parm);\ + DST(op,i*64+ 2, (w0 >> 10) & 0x1f, parm);\ + DST(op,i*64+ 3, (w0 >> 15) & 0x1f, parm);\ + DST(op,i*64+ 4, (w0 >> 20) & 0x1f, parm);\ + DST(op,i*64+ 5, (w0 >> 25) & 0x1f, parm);\ + DST(op,i*64+ 6, (w0 >> 30) & 0x1f, parm);\ + DST(op,i*64+ 7, (w0 >> 35) & 0x1f, parm);\ + DST(op,i*64+ 8, (w0 >> 40) & 0x1f, parm);\ + DST(op,i*64+ 9, (w0 >> 45) & 0x1f, parm);\ + DST(op,i*64+10, (w0 >> 50) & 0x1f, parm);\ + DST(op,i*64+11, (w0 >> 55) & 0x1f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0])); \ +\ + DST(op,i*64+12, (w0 >> 60) | (w1 << 4) & 0x1f, parm);\ + DST(op,i*64+13, (w1 >> 1) & 0x1f, parm);\ + DST(op,i*64+14, (w1 >> 6) & 0x1f, parm);\ + DST(op,i*64+15, (w1 >> 11) & 0x1f, parm);\ + DST(op,i*64+16, (w1 >> 16) & 0x1f, parm);\ + DST(op,i*64+17, (w1 >> 21) & 0x1f, parm);\ + DST(op,i*64+18, (w1 >> 26) & 0x1f, parm);\ + DST(op,i*64+19, (w1 >> 31) & 0x1f, parm);\ + DST(op,i*64+20, (w1 >> 36) & 0x1f, parm);\ + DST(op,i*64+21, (w1 >> 41) & 0x1f, parm);\ + DST(op,i*64+22, (w1 >> 46) & 0x1f, parm);\ + DST(op,i*64+23, (w1 >> 51) & 0x1f, parm);\ + DST(op,i*64+24, (w1 >> 56) & 0x1f, parm); register uint32_t w2 = *(uint32_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w1 >> 61) | (w2 << 3) & 0x1f, parm);\ + DST(op,i*64+26, (w2 >> 2) & 0x1f, parm);\ + DST(op,i*64+27, (w2 >> 7) & 0x1f, parm);\ + DST(op,i*64+28, (w2 >> 12) & 0x1f, parm);\ + DST(op,i*64+29, (w2 >> 17) & 0x1f, parm);\ + DST(op,i*64+30, (w2 >> 22) & 0x1f, parm);\ + DST(op,i*64+31, (w2 >> 27) & 0x1f, parm);;\ +} + +#define BITUNPACK64_5(ip, op, parm) { \ + 
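/* 5 bits/value: each call unpacks 32 values from 5*4 = 20 bytes (two 64-bit words plus a 32-bit tail) */\ +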
BITUNBLK64_5(ip, 0, op, parm); DSTI(op); ip += 5*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_6(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3f, parm);\ + DST(op,i*32+ 1, (w0 >> 6) & 0x3f, parm);\ + DST(op,i*32+ 2, (w0 >> 12) & 0x3f, parm);\ + DST(op,i*32+ 3, (w0 >> 18) & 0x3f, parm);\ + DST(op,i*32+ 4, (w0 >> 24) & 0x3f, parm);\ + DST(op,i*32+ 5, (w0 >> 30) & 0x3f, parm);\ + DST(op,i*32+ 6, (w0 >> 36) & 0x3f, parm);\ + DST(op,i*32+ 7, (w0 >> 42) & 0x3f, parm);\ + DST(op,i*32+ 8, (w0 >> 48) & 0x3f, parm);\ + DST(op,i*32+ 9, (w0 >> 54) & 0x3f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w0 >> 60) | (w1 << 4) & 0x3f, parm);\ + DST(op,i*32+11, (w1 >> 2) & 0x3f, parm);\ + DST(op,i*32+12, (w1 >> 8) & 0x3f, parm);\ + DST(op,i*32+13, (w1 >> 14) & 0x3f, parm);\ + DST(op,i*32+14, (w1 >> 20) & 0x3f, parm);\ + DST(op,i*32+15, (w1 >> 26) & 0x3f, parm);\ + DST(op,i*32+16, (w1 >> 32) & 0x3f, parm);\ + DST(op,i*32+17, (w1 >> 38) & 0x3f, parm);\ + DST(op,i*32+18, (w1 >> 44) & 0x3f, parm);\ + DST(op,i*32+19, (w1 >> 50) & 0x3f, parm);\ + DST(op,i*32+20, (w1 >> 56) & 0x3f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w1 >> 62) | (w2 << 2) & 0x3f, parm);\ + DST(op,i*32+22, (w2 >> 4) & 0x3f, parm);\ + DST(op,i*32+23, (w2 >> 10) & 0x3f, parm);\ + DST(op,i*32+24, (w2 >> 16) & 0x3f, parm);\ + DST(op,i*32+25, (w2 >> 22) & 0x3f, parm);\ + DST(op,i*32+26, (w2 >> 28) & 0x3f, parm);\ + DST(op,i*32+27, (w2 >> 34) & 0x3f, parm);\ + DST(op,i*32+28, (w2 >> 40) & 0x3f, parm);\ + DST(op,i*32+29, (w2 >> 46) & 0x3f, parm);\ + DST(op,i*32+30, (w2 >> 52) & 0x3f, parm);\ + DST(op,i*32+31, (w2 >> 58) , parm);;\ +} + +#define BITUNPACK64_6(ip, op, parm) { \ + BITUNBLK64_6(ip, 0, op, parm); DSTI(op); ip += 6*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_7(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7f, parm);\ + DST(op,i*64+ 1, (w0 >> 7) & 0x7f, parm);\ + DST(op,i*64+ 2, (w0 >> 14) & 0x7f, parm);\ + DST(op,i*64+ 3, (w0 >> 21) & 0x7f, parm);\ + DST(op,i*64+ 4, (w0 >> 28) & 0x7f, parm);\ + DST(op,i*64+ 5, (w0 >> 35) & 0x7f, parm);\ + DST(op,i*64+ 6, (w0 >> 42) & 0x7f, parm);\ + DST(op,i*64+ 7, (w0 >> 49) & 0x7f, parm);\ + DST(op,i*64+ 8, (w0 >> 56) & 0x7f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w0 >> 63) | (w1 << 1) & 0x7f, parm);\ + DST(op,i*64+10, (w1 >> 6) & 0x7f, parm);\ + DST(op,i*64+11, (w1 >> 13) & 0x7f, parm);\ + DST(op,i*64+12, (w1 >> 20) & 0x7f, parm);\ + DST(op,i*64+13, (w1 >> 27) & 0x7f, parm);\ + DST(op,i*64+14, (w1 >> 34) & 0x7f, parm);\ + DST(op,i*64+15, (w1 >> 41) & 0x7f, parm);\ + DST(op,i*64+16, (w1 >> 48) & 0x7f, parm);\ + DST(op,i*64+17, (w1 >> 55) & 0x7f, parm); register uint32_t w3 = *(uint32_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w1 >> 62) | (w2 << 2) & 0x7f, parm);\ + DST(op,i*64+19, (w2 >> 5) & 0x7f, parm);\ + DST(op,i*64+20, (w2 >> 12) & 0x7f, parm);\ + DST(op,i*64+21, (w2 >> 19) & 0x7f, parm);\ + DST(op,i*64+22, (w2 >> 26) & 0x7f, parm);\ + DST(op,i*64+23, (w2 >> 33) & 0x7f, parm);\ + DST(op,i*64+24, (w2 >> 40) & 0x7f, parm);\ + DST(op,i*64+25, (w2 >> 47) & 0x7f, parm);\ + DST(op,i*64+26, (w2 >> 54) & 0x7f, parm); \ +\ + DST(op,i*64+27, (w2 >> 61) | (w3 << 3) & 0x7f, parm);\ + DST(op,i*64+28, (w3 >> 4) & 0x7f, parm);\ + DST(op,i*64+29, 
(w3 >> 11) & 0x7f, parm);\ + DST(op,i*64+30, (w3 >> 18) & 0x7f, parm);\ + DST(op,i*64+31, (w3 >> 25) & 0x7f, parm);;\ +} + +#define BITUNPACK64_7(ip, op, parm) { \ + BITUNBLK64_7(ip, 0, op, parm); DSTI(op); ip += 7*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_8(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ + DST(op,i*8+ 0, (w0 ) & 0xff, parm);\ + DST(op,i*8+ 1, (w0 >> 8) & 0xff, parm);\ + DST(op,i*8+ 2, (w0 >> 16) & 0xff, parm);\ + DST(op,i*8+ 3, (w0 >> 24) & 0xff, parm);\ + DST(op,i*8+ 4, (w0 >> 32) & 0xff, parm);\ + DST(op,i*8+ 5, (w0 >> 40) & 0xff, parm);\ + DST(op,i*8+ 6, (w0 >> 48) & 0xff, parm);\ + DST(op,i*8+ 7, (w0 >> 56) , parm);;\ +} + +#define BITUNPACK64_8(ip, op, parm) { \ + BITUNBLK64_8(ip, 0, op, parm);\ + BITUNBLK64_8(ip, 1, op, parm);\ + BITUNBLK64_8(ip, 2, op, parm);\ + BITUNBLK64_8(ip, 3, op, parm); DSTI(op); ip += 8*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_9(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ff, parm);\ + DST(op,i*64+ 1, (w0 >> 9) & 0x1ff, parm);\ + DST(op,i*64+ 2, (w0 >> 18) & 0x1ff, parm);\ + DST(op,i*64+ 3, (w0 >> 27) & 0x1ff, parm);\ + DST(op,i*64+ 4, (w0 >> 36) & 0x1ff, parm);\ + DST(op,i*64+ 5, (w0 >> 45) & 0x1ff, parm);\ + DST(op,i*64+ 6, (w0 >> 54) & 0x1ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w0 >> 63) | (w1 << 1) & 0x1ff, parm);\ + DST(op,i*64+ 8, (w1 >> 8) & 0x1ff, parm);\ + DST(op,i*64+ 9, (w1 >> 17) & 0x1ff, parm);\ + DST(op,i*64+10, (w1 >> 26) & 0x1ff, parm);\ + DST(op,i*64+11, (w1 >> 35) & 0x1ff, parm);\ + DST(op,i*64+12, (w1 >> 44) & 0x1ff, parm);\ + DST(op,i*64+13, (w1 >> 53) & 0x1ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w1 >> 62) | (w2 << 2) & 0x1ff, parm);\ + DST(op,i*64+15, (w2 >> 7) & 0x1ff, parm);\ + DST(op,i*64+16, (w2 >> 16) & 0x1ff, parm);\ + DST(op,i*64+17, (w2 >> 25) & 0x1ff, parm);\ + DST(op,i*64+18, (w2 >> 34) & 0x1ff, parm);\ + DST(op,i*64+19, (w2 >> 43) & 0x1ff, parm);\ + DST(op,i*64+20, (w2 >> 52) & 0x1ff, parm); register uint32_t w4 = *(uint32_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w2 >> 61) | (w3 << 3) & 0x1ff, parm);\ + DST(op,i*64+22, (w3 >> 6) & 0x1ff, parm);\ + DST(op,i*64+23, (w3 >> 15) & 0x1ff, parm);\ + DST(op,i*64+24, (w3 >> 24) & 0x1ff, parm);\ + DST(op,i*64+25, (w3 >> 33) & 0x1ff, parm);\ + DST(op,i*64+26, (w3 >> 42) & 0x1ff, parm);\ + DST(op,i*64+27, (w3 >> 51) & 0x1ff, parm); \ +\ + DST(op,i*64+28, (w3 >> 60) | (w4 << 4) & 0x1ff, parm);\ + DST(op,i*64+29, (w4 >> 5) & 0x1ff, parm);\ + DST(op,i*64+30, (w4 >> 14) & 0x1ff, parm);\ + DST(op,i*64+31, (w4 >> 23) & 0x1ff, parm);;\ +} + +#define BITUNPACK64_9(ip, op, parm) { \ + BITUNBLK64_9(ip, 0, op, parm); DSTI(op); ip += 9*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_10(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ff, parm);\ + DST(op,i*32+ 1, (w0 >> 10) & 0x3ff, parm);\ + DST(op,i*32+ 2, (w0 >> 20) & 0x3ff, parm);\ + DST(op,i*32+ 3, (w0 >> 30) & 0x3ff, parm);\ + DST(op,i*32+ 4, (w0 >> 40) & 0x3ff, parm);\ + DST(op,i*32+ 5, (w0 >> 50) & 0x3ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 6, (w0 >> 60) | (w1 << 4) & 0x3ff, parm);\ + DST(op,i*32+ 7, (w1 >> 6) & 0x3ff, parm);\ + 
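/* values straddling a word boundary rely on & binding tighter than |: (lo >> s) | ((hi << t) & mask) */\ +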
DST(op,i*32+ 8, (w1 >> 16) & 0x3ff, parm);\ + DST(op,i*32+ 9, (w1 >> 26) & 0x3ff, parm);\ + DST(op,i*32+10, (w1 >> 36) & 0x3ff, parm);\ + DST(op,i*32+11, (w1 >> 46) & 0x3ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w1 >> 56) | (w2 << 8) & 0x3ff, parm);\ + DST(op,i*32+13, (w2 >> 2) & 0x3ff, parm);\ + DST(op,i*32+14, (w2 >> 12) & 0x3ff, parm);\ + DST(op,i*32+15, (w2 >> 22) & 0x3ff, parm);\ + DST(op,i*32+16, (w2 >> 32) & 0x3ff, parm);\ + DST(op,i*32+17, (w2 >> 42) & 0x3ff, parm);\ + DST(op,i*32+18, (w2 >> 52) & 0x3ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w2 >> 62) | (w3 << 2) & 0x3ff, parm);\ + DST(op,i*32+20, (w3 >> 8) & 0x3ff, parm);\ + DST(op,i*32+21, (w3 >> 18) & 0x3ff, parm);\ + DST(op,i*32+22, (w3 >> 28) & 0x3ff, parm);\ + DST(op,i*32+23, (w3 >> 38) & 0x3ff, parm);\ + DST(op,i*32+24, (w3 >> 48) & 0x3ff, parm); \ +\ + DST(op,i*32+25, (w3 >> 58) | (w4 << 6) & 0x3ff, parm);\ + DST(op,i*32+26, (w4 >> 4) & 0x3ff, parm);\ + DST(op,i*32+27, (w4 >> 14) & 0x3ff, parm);\ + DST(op,i*32+28, (w4 >> 24) & 0x3ff, parm);\ + DST(op,i*32+29, (w4 >> 34) & 0x3ff, parm);\ + DST(op,i*32+30, (w4 >> 44) & 0x3ff, parm);\ + DST(op,i*32+31, (w4 >> 54) , parm);;\ +} + +#define BITUNPACK64_10(ip, op, parm) { \ + BITUNBLK64_10(ip, 0, op, parm); DSTI(op); ip += 10*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_11(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ff, parm);\ + DST(op,i*64+ 1, (w0 >> 11) & 0x7ff, parm);\ + DST(op,i*64+ 2, (w0 >> 22) & 0x7ff, parm);\ + DST(op,i*64+ 3, (w0 >> 33) & 0x7ff, parm);\ + DST(op,i*64+ 4, (w0 >> 44) & 0x7ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w0 >> 55) | (w1 << 9) & 0x7ff, parm);\ + DST(op,i*64+ 6, (w1 >> 2) & 0x7ff, parm);\ + DST(op,i*64+ 7, (w1 >> 13) & 0x7ff, parm);\ + DST(op,i*64+ 8, (w1 >> 24) & 0x7ff, parm);\ + DST(op,i*64+ 9, (w1 >> 35) & 0x7ff, parm);\ + DST(op,i*64+10, (w1 >> 46) & 0x7ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w1 >> 57) | (w2 << 7) & 0x7ff, parm);\ + DST(op,i*64+12, (w2 >> 4) & 0x7ff, parm);\ + DST(op,i*64+13, (w2 >> 15) & 0x7ff, parm);\ + DST(op,i*64+14, (w2 >> 26) & 0x7ff, parm);\ + DST(op,i*64+15, (w2 >> 37) & 0x7ff, parm);\ + DST(op,i*64+16, (w2 >> 48) & 0x7ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w2 >> 59) | (w3 << 5) & 0x7ff, parm);\ + DST(op,i*64+18, (w3 >> 6) & 0x7ff, parm);\ + DST(op,i*64+19, (w3 >> 17) & 0x7ff, parm);\ + DST(op,i*64+20, (w3 >> 28) & 0x7ff, parm);\ + DST(op,i*64+21, (w3 >> 39) & 0x7ff, parm);\ + DST(op,i*64+22, (w3 >> 50) & 0x7ff, parm); register uint32_t w5 = *(uint32_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w3 >> 61) | (w4 << 3) & 0x7ff, parm);\ + DST(op,i*64+24, (w4 >> 8) & 0x7ff, parm);\ + DST(op,i*64+25, (w4 >> 19) & 0x7ff, parm);\ + DST(op,i*64+26, (w4 >> 30) & 0x7ff, parm);\ + DST(op,i*64+27, (w4 >> 41) & 0x7ff, parm);\ + DST(op,i*64+28, (w4 >> 52) & 0x7ff, parm); \ +\ + DST(op,i*64+29, (w4 >> 63) | (w5 << 1) & 0x7ff, parm);\ + DST(op,i*64+30, (w5 >> 10) & 0x7ff, parm);\ + DST(op,i*64+31, (w5 >> 21) & 0x7ff, parm);;\ +} + +#define BITUNPACK64_11(ip, op, parm) { \ + BITUNBLK64_11(ip, 0, op, parm); DSTI(op); ip += 11*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_12(ip, i, op, parm) { register 
uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 0xfff, parm);\ + DST(op,i*16+ 1, (w0 >> 12) & 0xfff, parm);\ + DST(op,i*16+ 2, (w0 >> 24) & 0xfff, parm);\ + DST(op,i*16+ 3, (w0 >> 36) & 0xfff, parm);\ + DST(op,i*16+ 4, (w0 >> 48) & 0xfff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 5, (w0 >> 60) | (w1 << 4) & 0xfff, parm);\ + DST(op,i*16+ 6, (w1 >> 8) & 0xfff, parm);\ + DST(op,i*16+ 7, (w1 >> 20) & 0xfff, parm);\ + DST(op,i*16+ 8, (w1 >> 32) & 0xfff, parm);\ + DST(op,i*16+ 9, (w1 >> 44) & 0xfff, parm); \ +\ + DST(op,i*16+10, (w1 >> 56) | (w2 << 8) & 0xfff, parm);\ + DST(op,i*16+11, (w2 >> 4) & 0xfff, parm);\ + DST(op,i*16+12, (w2 >> 16) & 0xfff, parm);\ + DST(op,i*16+13, (w2 >> 28) & 0xfff, parm);\ + DST(op,i*16+14, (w2 >> 40) & 0xfff, parm);\ + DST(op,i*16+15, (w2 >> 52) , parm);;\ +} + +#define BITUNPACK64_12(ip, op, parm) { \ + BITUNBLK64_12(ip, 0, op, parm);\ + BITUNBLK64_12(ip, 1, op, parm); DSTI(op); ip += 12*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_13(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fff, parm);\ + DST(op,i*64+ 1, (w0 >> 13) & 0x1fff, parm);\ + DST(op,i*64+ 2, (w0 >> 26) & 0x1fff, parm);\ + DST(op,i*64+ 3, (w0 >> 39) & 0x1fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w0 >> 52) | (w1 << 12) & 0x1fff, parm);\ + DST(op,i*64+ 5, (w1 >> 1) & 0x1fff, parm);\ + DST(op,i*64+ 6, (w1 >> 14) & 0x1fff, parm);\ + DST(op,i*64+ 7, (w1 >> 27) & 0x1fff, parm);\ + DST(op,i*64+ 8, (w1 >> 40) & 0x1fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w1 >> 53) | (w2 << 11) & 0x1fff, parm);\ + DST(op,i*64+10, (w2 >> 2) & 0x1fff, parm);\ + DST(op,i*64+11, (w2 >> 15) & 0x1fff, parm);\ + DST(op,i*64+12, (w2 >> 28) & 0x1fff, parm);\ + DST(op,i*64+13, (w2 >> 41) & 0x1fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w2 >> 54) | (w3 << 10) & 0x1fff, parm);\ + DST(op,i*64+15, (w3 >> 3) & 0x1fff, parm);\ + DST(op,i*64+16, (w3 >> 16) & 0x1fff, parm);\ + DST(op,i*64+17, (w3 >> 29) & 0x1fff, parm);\ + DST(op,i*64+18, (w3 >> 42) & 0x1fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w3 >> 55) | (w4 << 9) & 0x1fff, parm);\ + DST(op,i*64+20, (w4 >> 4) & 0x1fff, parm);\ + DST(op,i*64+21, (w4 >> 17) & 0x1fff, parm);\ + DST(op,i*64+22, (w4 >> 30) & 0x1fff, parm);\ + DST(op,i*64+23, (w4 >> 43) & 0x1fff, parm); register uint32_t w6 = *(uint32_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w4 >> 56) | (w5 << 8) & 0x1fff, parm);\ + DST(op,i*64+25, (w5 >> 5) & 0x1fff, parm);\ + DST(op,i*64+26, (w5 >> 18) & 0x1fff, parm);\ + DST(op,i*64+27, (w5 >> 31) & 0x1fff, parm);\ + DST(op,i*64+28, (w5 >> 44) & 0x1fff, parm); \ +\ + DST(op,i*64+29, (w5 >> 57) | (w6 << 7) & 0x1fff, parm);\ + DST(op,i*64+30, (w6 >> 6) & 0x1fff, parm);\ + DST(op,i*64+31, (w6 >> 19) & 0x1fff, parm);;\ +} + +#define BITUNPACK64_13(ip, op, parm) { \ + BITUNBLK64_13(ip, 0, op, parm); DSTI(op); ip += 13*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_14(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fff, parm);\ + DST(op,i*32+ 1, (w0 >> 14) & 0x3fff, parm);\ + DST(op,i*32+ 2, (w0 >> 28) & 
0x3fff, parm);\ + DST(op,i*32+ 3, (w0 >> 42) & 0x3fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w0 >> 56) | (w1 << 8) & 0x3fff, parm);\ + DST(op,i*32+ 5, (w1 >> 6) & 0x3fff, parm);\ + DST(op,i*32+ 6, (w1 >> 20) & 0x3fff, parm);\ + DST(op,i*32+ 7, (w1 >> 34) & 0x3fff, parm);\ + DST(op,i*32+ 8, (w1 >> 48) & 0x3fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 9, (w1 >> 62) | (w2 << 2) & 0x3fff, parm);\ + DST(op,i*32+10, (w2 >> 12) & 0x3fff, parm);\ + DST(op,i*32+11, (w2 >> 26) & 0x3fff, parm);\ + DST(op,i*32+12, (w2 >> 40) & 0x3fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+13, (w2 >> 54) | (w3 << 10) & 0x3fff, parm);\ + DST(op,i*32+14, (w3 >> 4) & 0x3fff, parm);\ + DST(op,i*32+15, (w3 >> 18) & 0x3fff, parm);\ + DST(op,i*32+16, (w3 >> 32) & 0x3fff, parm);\ + DST(op,i*32+17, (w3 >> 46) & 0x3fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+18, (w3 >> 60) | (w4 << 4) & 0x3fff, parm);\ + DST(op,i*32+19, (w4 >> 10) & 0x3fff, parm);\ + DST(op,i*32+20, (w4 >> 24) & 0x3fff, parm);\ + DST(op,i*32+21, (w4 >> 38) & 0x3fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+22, (w4 >> 52) | (w5 << 12) & 0x3fff, parm);\ + DST(op,i*32+23, (w5 >> 2) & 0x3fff, parm);\ + DST(op,i*32+24, (w5 >> 16) & 0x3fff, parm);\ + DST(op,i*32+25, (w5 >> 30) & 0x3fff, parm);\ + DST(op,i*32+26, (w5 >> 44) & 0x3fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w5 >> 58) | (w6 << 6) & 0x3fff, parm);\ + DST(op,i*32+28, (w6 >> 8) & 0x3fff, parm);\ + DST(op,i*32+29, (w6 >> 22) & 0x3fff, parm);\ + DST(op,i*32+30, (w6 >> 36) & 0x3fff, parm);\ + DST(op,i*32+31, (w6 >> 50) , parm);;\ +} + +#define BITUNPACK64_14(ip, op, parm) { \ + BITUNBLK64_14(ip, 0, op, parm); DSTI(op); ip += 14*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_15(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fff, parm);\ + DST(op,i*64+ 1, (w0 >> 15) & 0x7fff, parm);\ + DST(op,i*64+ 2, (w0 >> 30) & 0x7fff, parm);\ + DST(op,i*64+ 3, (w0 >> 45) & 0x7fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w0 >> 60) | (w1 << 4) & 0x7fff, parm);\ + DST(op,i*64+ 5, (w1 >> 11) & 0x7fff, parm);\ + DST(op,i*64+ 6, (w1 >> 26) & 0x7fff, parm);\ + DST(op,i*64+ 7, (w1 >> 41) & 0x7fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w1 >> 56) | (w2 << 8) & 0x7fff, parm);\ + DST(op,i*64+ 9, (w2 >> 7) & 0x7fff, parm);\ + DST(op,i*64+10, (w2 >> 22) & 0x7fff, parm);\ + DST(op,i*64+11, (w2 >> 37) & 0x7fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w2 >> 52) | (w3 << 12) & 0x7fff, parm);\ + DST(op,i*64+13, (w3 >> 3) & 0x7fff, parm);\ + DST(op,i*64+14, (w3 >> 18) & 0x7fff, parm);\ + DST(op,i*64+15, (w3 >> 33) & 0x7fff, parm);\ + DST(op,i*64+16, (w3 >> 48) & 0x7fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w3 >> 63) | (w4 << 1) & 0x7fff, parm);\ + DST(op,i*64+18, (w4 >> 14) & 0x7fff, parm);\ + DST(op,i*64+19, (w4 >> 29) & 0x7fff, parm);\ + DST(op,i*64+20, (w4 >> 44) & 0x7fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w4 >> 59) | (w5 << 5) & 0x7fff, parm);\ + DST(op,i*64+22, (w5 >> 
10) & 0x7fff, parm);\ + DST(op,i*64+23, (w5 >> 25) & 0x7fff, parm);\ + DST(op,i*64+24, (w5 >> 40) & 0x7fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w5 >> 55) | (w6 << 9) & 0x7fff, parm);\ + DST(op,i*64+26, (w6 >> 6) & 0x7fff, parm);\ + DST(op,i*64+27, (w6 >> 21) & 0x7fff, parm);\ + DST(op,i*64+28, (w6 >> 36) & 0x7fff, parm); register uint32_t w7 = *(uint32_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+29, (w6 >> 51) | (w7 << 13) & 0x7fff, parm);\ + DST(op,i*64+30, (w7 >> 2) & 0x7fff, parm);\ + DST(op,i*64+31, (w7 >> 17) & 0x7fff, parm);;\ +} + +#define BITUNPACK64_15(ip, op, parm) { \ + BITUNBLK64_15(ip, 0, op, parm); DSTI(op); ip += 15*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_16(ip, i, op, parm) { \ + DST(op,i*4+ 0, *(uint16_t *)(ip+i*8+ 0), parm);\ + DST(op,i*4+ 1, *(uint16_t *)(ip+i*8+ 2), parm);\ + DST(op,i*4+ 2, *(uint16_t *)(ip+i*8+ 4), parm);\ + DST(op,i*4+ 3, *(uint16_t *)(ip+i*8+ 6), parm);;\ +} + +#define BITUNPACK64_16(ip, op, parm) { \ + BITUNBLK64_16(ip, 0, op, parm);\ + BITUNBLK64_16(ip, 1, op, parm);\ + BITUNBLK64_16(ip, 2, op, parm);\ + BITUNBLK64_16(ip, 3, op, parm);\ + BITUNBLK64_16(ip, 4, op, parm);\ + BITUNBLK64_16(ip, 5, op, parm);\ + BITUNBLK64_16(ip, 6, op, parm);\ + BITUNBLK64_16(ip, 7, op, parm); DSTI(op); ip += 16*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_17(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*17+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ffff, parm);\ + DST(op,i*64+ 1, (w0 >> 17) & 0x1ffff, parm);\ + DST(op,i*64+ 2, (w0 >> 34) & 0x1ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*17+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 51) | (w1 << 13) & 0x1ffff, parm);\ + DST(op,i*64+ 4, (w1 >> 4) & 0x1ffff, parm);\ + DST(op,i*64+ 5, (w1 >> 21) & 0x1ffff, parm);\ + DST(op,i*64+ 6, (w1 >> 38) & 0x1ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*17+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w1 >> 55) | (w2 << 9) & 0x1ffff, parm);\ + DST(op,i*64+ 8, (w2 >> 8) & 0x1ffff, parm);\ + DST(op,i*64+ 9, (w2 >> 25) & 0x1ffff, parm);\ + DST(op,i*64+10, (w2 >> 42) & 0x1ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*17+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w2 >> 59) | (w3 << 5) & 0x1ffff, parm);\ + DST(op,i*64+12, (w3 >> 12) & 0x1ffff, parm);\ + DST(op,i*64+13, (w3 >> 29) & 0x1ffff, parm);\ + DST(op,i*64+14, (w3 >> 46) & 0x1ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*17+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w3 >> 63) | (w4 << 1) & 0x1ffff, parm);\ + DST(op,i*64+16, (w4 >> 16) & 0x1ffff, parm);\ + DST(op,i*64+17, (w4 >> 33) & 0x1ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*17+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w4 >> 50) | (w5 << 14) & 0x1ffff, parm);\ + DST(op,i*64+19, (w5 >> 3) & 0x1ffff, parm);\ + DST(op,i*64+20, (w5 >> 20) & 0x1ffff, parm);\ + DST(op,i*64+21, (w5 >> 37) & 0x1ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*17+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w5 >> 54) | (w6 << 10) & 0x1ffff, parm);\ + DST(op,i*64+23, (w6 >> 7) & 0x1ffff, parm);\ + DST(op,i*64+24, (w6 >> 24) & 0x1ffff, parm);\ + DST(op,i*64+25, (w6 >> 41) & 0x1ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*17+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w6 >> 58) | (w7 << 6) & 0x1ffff, parm);\ + DST(op,i*64+27, (w7 >> 11) & 0x1ffff, parm);\ + DST(op,i*64+28, (w7 >> 28) & 0x1ffff, parm);\ + DST(op,i*64+29, (w7 >> 45) & 0x1ffff, parm); register uint32_t w8 = *(uint32_t *)(ip+(i*17+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, 
(w7 >> 62) | (w8 << 2) & 0x1ffff, parm);\ + DST(op,i*64+31, (w8 >> 15) & 0x1ffff, parm);;\ +} + +#define BITUNPACK64_17(ip, op, parm) { \ + BITUNBLK64_17(ip, 0, op, parm); DSTI(op); ip += 17*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_18(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ffff, parm);\ + DST(op,i*32+ 1, (w0 >> 18) & 0x3ffff, parm);\ + DST(op,i*32+ 2, (w0 >> 36) & 0x3ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 3, (w0 >> 54) | (w1 << 10) & 0x3ffff, parm);\ + DST(op,i*32+ 4, (w1 >> 8) & 0x3ffff, parm);\ + DST(op,i*32+ 5, (w1 >> 26) & 0x3ffff, parm);\ + DST(op,i*32+ 6, (w1 >> 44) & 0x3ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 7, (w1 >> 62) | (w2 << 2) & 0x3ffff, parm);\ + DST(op,i*32+ 8, (w2 >> 16) & 0x3ffff, parm);\ + DST(op,i*32+ 9, (w2 >> 34) & 0x3ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w2 >> 52) | (w3 << 12) & 0x3ffff, parm);\ + DST(op,i*32+11, (w3 >> 6) & 0x3ffff, parm);\ + DST(op,i*32+12, (w3 >> 24) & 0x3ffff, parm);\ + DST(op,i*32+13, (w3 >> 42) & 0x3ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w3 >> 60) | (w4 << 4) & 0x3ffff, parm);\ + DST(op,i*32+15, (w4 >> 14) & 0x3ffff, parm);\ + DST(op,i*32+16, (w4 >> 32) & 0x3ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*9+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w4 >> 50) | (w5 << 14) & 0x3ffff, parm);\ + DST(op,i*32+18, (w5 >> 4) & 0x3ffff, parm);\ + DST(op,i*32+19, (w5 >> 22) & 0x3ffff, parm);\ + DST(op,i*32+20, (w5 >> 40) & 0x3ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*9+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w5 >> 58) | (w6 << 6) & 0x3ffff, parm);\ + DST(op,i*32+22, (w6 >> 12) & 0x3ffff, parm);\ + DST(op,i*32+23, (w6 >> 30) & 0x3ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*9+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+24, (w6 >> 48) | (w7 << 16) & 0x3ffff, parm);\ + DST(op,i*32+25, (w7 >> 2) & 0x3ffff, parm);\ + DST(op,i*32+26, (w7 >> 20) & 0x3ffff, parm);\ + DST(op,i*32+27, (w7 >> 38) & 0x3ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*9+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+28, (w7 >> 56) | (w8 << 8) & 0x3ffff, parm);\ + DST(op,i*32+29, (w8 >> 10) & 0x3ffff, parm);\ + DST(op,i*32+30, (w8 >> 28) & 0x3ffff, parm);\ + DST(op,i*32+31, (w8 >> 46) , parm);;\ +} + +#define BITUNPACK64_18(ip, op, parm) { \ + BITUNBLK64_18(ip, 0, op, parm); DSTI(op); ip += 18*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_19(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*19+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ffff, parm);\ + DST(op,i*64+ 1, (w0 >> 19) & 0x7ffff, parm);\ + DST(op,i*64+ 2, (w0 >> 38) & 0x7ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*19+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 57) | (w1 << 7) & 0x7ffff, parm);\ + DST(op,i*64+ 4, (w1 >> 12) & 0x7ffff, parm);\ + DST(op,i*64+ 5, (w1 >> 31) & 0x7ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*19+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w1 >> 50) | (w2 << 14) & 0x7ffff, parm);\ + DST(op,i*64+ 7, (w2 >> 5) & 0x7ffff, parm);\ + DST(op,i*64+ 8, (w2 >> 24) & 0x7ffff, parm);\ + DST(op,i*64+ 9, (w2 >> 43) & 0x7ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*19+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w2 >> 62) | (w3 << 2) & 0x7ffff, parm);\ + DST(op,i*64+11, (w3 >> 17) & 0x7ffff, parm);\ + 
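/* each source word is declared "register" and loaded just before its first use, interleaving loads with the shift/mask work */\ +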
DST(op,i*64+12, (w3 >> 36) & 0x7ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*19+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w3 >> 55) | (w4 << 9) & 0x7ffff, parm);\ + DST(op,i*64+14, (w4 >> 10) & 0x7ffff, parm);\ + DST(op,i*64+15, (w4 >> 29) & 0x7ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*19+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w4 >> 48) | (w5 << 16) & 0x7ffff, parm);\ + DST(op,i*64+17, (w5 >> 3) & 0x7ffff, parm);\ + DST(op,i*64+18, (w5 >> 22) & 0x7ffff, parm);\ + DST(op,i*64+19, (w5 >> 41) & 0x7ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*19+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w5 >> 60) | (w6 << 4) & 0x7ffff, parm);\ + DST(op,i*64+21, (w6 >> 15) & 0x7ffff, parm);\ + DST(op,i*64+22, (w6 >> 34) & 0x7ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*19+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w6 >> 53) | (w7 << 11) & 0x7ffff, parm);\ + DST(op,i*64+24, (w7 >> 8) & 0x7ffff, parm);\ + DST(op,i*64+25, (w7 >> 27) & 0x7ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*19+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w7 >> 46) | (w8 << 18) & 0x7ffff, parm);\ + DST(op,i*64+27, (w8 >> 1) & 0x7ffff, parm);\ + DST(op,i*64+28, (w8 >> 20) & 0x7ffff, parm);\ + DST(op,i*64+29, (w8 >> 39) & 0x7ffff, parm); register uint32_t w9 = *(uint32_t *)(ip+(i*19+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w8 >> 58) | (w9 << 6) & 0x7ffff, parm);\ + DST(op,i*64+31, (w9 >> 13) & 0x7ffff, parm);;\ +} + +#define BITUNPACK64_19(ip, op, parm) { \ + BITUNBLK64_19(ip, 0, op, parm); DSTI(op); ip += 19*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_20(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 0xfffff, parm);\ + DST(op,i*16+ 1, (w0 >> 20) & 0xfffff, parm);\ + DST(op,i*16+ 2, (w0 >> 40) & 0xfffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 3, (w0 >> 60) | (w1 << 4) & 0xfffff, parm);\ + DST(op,i*16+ 4, (w1 >> 16) & 0xfffff, parm);\ + DST(op,i*16+ 5, (w1 >> 36) & 0xfffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 6, (w1 >> 56) | (w2 << 8) & 0xfffff, parm);\ + DST(op,i*16+ 7, (w2 >> 12) & 0xfffff, parm);\ + DST(op,i*16+ 8, (w2 >> 32) & 0xfffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 9, (w2 >> 52) | (w3 << 12) & 0xfffff, parm);\ + DST(op,i*16+10, (w3 >> 8) & 0xfffff, parm);\ + DST(op,i*16+11, (w3 >> 28) & 0xfffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+12, (w3 >> 48) | (w4 << 16) & 0xfffff, parm);\ + DST(op,i*16+13, (w4 >> 4) & 0xfffff, parm);\ + DST(op,i*16+14, (w4 >> 24) & 0xfffff, parm);\ + DST(op,i*16+15, (w4 >> 44) , parm);;\ +} + +#define BITUNPACK64_20(ip, op, parm) { \ + BITUNBLK64_20(ip, 0, op, parm);\ + BITUNBLK64_20(ip, 1, op, parm); DSTI(op); ip += 20*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_21(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*21+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fffff, parm);\ + DST(op,i*64+ 1, (w0 >> 21) & 0x1fffff, parm);\ + DST(op,i*64+ 2, (w0 >> 42) & 0x1fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*21+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 3, (w0 >> 63) | (w1 << 1) & 0x1fffff, parm);\ + DST(op,i*64+ 4, (w1 >> 20) & 0x1fffff, parm);\ + DST(op,i*64+ 5, (w1 >> 41) & 0x1fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*21+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w1 >> 62) | (w2 << 2) & 0x1fffff, parm);\ + DST(op,i*64+ 7, (w2 
>> 19) & 0x1fffff, parm);\ + DST(op,i*64+ 8, (w2 >> 40) & 0x1fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*21+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w2 >> 61) | (w3 << 3) & 0x1fffff, parm);\ + DST(op,i*64+10, (w3 >> 18) & 0x1fffff, parm);\ + DST(op,i*64+11, (w3 >> 39) & 0x1fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*21+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w3 >> 60) | (w4 << 4) & 0x1fffff, parm);\ + DST(op,i*64+13, (w4 >> 17) & 0x1fffff, parm);\ + DST(op,i*64+14, (w4 >> 38) & 0x1fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*21+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w4 >> 59) | (w5 << 5) & 0x1fffff, parm);\ + DST(op,i*64+16, (w5 >> 16) & 0x1fffff, parm);\ + DST(op,i*64+17, (w5 >> 37) & 0x1fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*21+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w5 >> 58) | (w6 << 6) & 0x1fffff, parm);\ + DST(op,i*64+19, (w6 >> 15) & 0x1fffff, parm);\ + DST(op,i*64+20, (w6 >> 36) & 0x1fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*21+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w6 >> 57) | (w7 << 7) & 0x1fffff, parm);\ + DST(op,i*64+22, (w7 >> 14) & 0x1fffff, parm);\ + DST(op,i*64+23, (w7 >> 35) & 0x1fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*21+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w7 >> 56) | (w8 << 8) & 0x1fffff, parm);\ + DST(op,i*64+25, (w8 >> 13) & 0x1fffff, parm);\ + DST(op,i*64+26, (w8 >> 34) & 0x1fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*21+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+27, (w8 >> 55) | (w9 << 9) & 0x1fffff, parm);\ + DST(op,i*64+28, (w9 >> 12) & 0x1fffff, parm);\ + DST(op,i*64+29, (w9 >> 33) & 0x1fffff, parm); register uint32_t w10 = *(uint32_t *)(ip+(i*21+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w9 >> 54) | (w10 << 10) & 0x1fffff, parm);\ + DST(op,i*64+31, (w10 >> 11) & 0x1fffff, parm);;\ +} + +#define BITUNPACK64_21(ip, op, parm) { \ + BITUNBLK64_21(ip, 0, op, parm); DSTI(op); ip += 21*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_22(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fffff, parm);\ + DST(op,i*32+ 1, (w0 >> 22) & 0x3fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 44) | (w1 << 20) & 0x3fffff, parm);\ + DST(op,i*32+ 3, (w1 >> 2) & 0x3fffff, parm);\ + DST(op,i*32+ 4, (w1 >> 24) & 0x3fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 5, (w1 >> 46) | (w2 << 18) & 0x3fffff, parm);\ + DST(op,i*32+ 6, (w2 >> 4) & 0x3fffff, parm);\ + DST(op,i*32+ 7, (w2 >> 26) & 0x3fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 8, (w2 >> 48) | (w3 << 16) & 0x3fffff, parm);\ + DST(op,i*32+ 9, (w3 >> 6) & 0x3fffff, parm);\ + DST(op,i*32+10, (w3 >> 28) & 0x3fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+11, (w3 >> 50) | (w4 << 14) & 0x3fffff, parm);\ + DST(op,i*32+12, (w4 >> 8) & 0x3fffff, parm);\ + DST(op,i*32+13, (w4 >> 30) & 0x3fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w4 >> 52) | (w5 << 12) & 0x3fffff, parm);\ + DST(op,i*32+15, (w5 >> 10) & 0x3fffff, parm);\ + DST(op,i*32+16, (w5 >> 32) & 0x3fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*11+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w5 >> 54) | (w6 << 10) & 0x3fffff, parm);\ + DST(op,i*32+18, (w6 >> 12) & 0x3fffff, parm);\ + DST(op,i*32+19, (w6 >> 34) & 
0x3fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*11+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+20, (w6 >> 56) | (w7 << 8) & 0x3fffff, parm);\ + DST(op,i*32+21, (w7 >> 14) & 0x3fffff, parm);\ + DST(op,i*32+22, (w7 >> 36) & 0x3fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*11+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+23, (w7 >> 58) | (w8 << 6) & 0x3fffff, parm);\ + DST(op,i*32+24, (w8 >> 16) & 0x3fffff, parm);\ + DST(op,i*32+25, (w8 >> 38) & 0x3fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*11+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+26, (w8 >> 60) | (w9 << 4) & 0x3fffff, parm);\ + DST(op,i*32+27, (w9 >> 18) & 0x3fffff, parm);\ + DST(op,i*32+28, (w9 >> 40) & 0x3fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*11+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w9 >> 62) | (w10 << 2) & 0x3fffff, parm);\ + DST(op,i*32+30, (w10 >> 20) & 0x3fffff, parm);\ + DST(op,i*32+31, (w10 >> 42) , parm);;\ +} + +#define BITUNPACK64_22(ip, op, parm) { \ + BITUNBLK64_22(ip, 0, op, parm); DSTI(op); ip += 22*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_23(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*23+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fffff, parm);\ + DST(op,i*64+ 1, (w0 >> 23) & 0x7fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*23+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 46) | (w1 << 18) & 0x7fffff, parm);\ + DST(op,i*64+ 3, (w1 >> 5) & 0x7fffff, parm);\ + DST(op,i*64+ 4, (w1 >> 28) & 0x7fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*23+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w1 >> 51) | (w2 << 13) & 0x7fffff, parm);\ + DST(op,i*64+ 6, (w2 >> 10) & 0x7fffff, parm);\ + DST(op,i*64+ 7, (w2 >> 33) & 0x7fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*23+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w2 >> 56) | (w3 << 8) & 0x7fffff, parm);\ + DST(op,i*64+ 9, (w3 >> 15) & 0x7fffff, parm);\ + DST(op,i*64+10, (w3 >> 38) & 0x7fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*23+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w3 >> 61) | (w4 << 3) & 0x7fffff, parm);\ + DST(op,i*64+12, (w4 >> 20) & 0x7fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*23+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w4 >> 43) | (w5 << 21) & 0x7fffff, parm);\ + DST(op,i*64+14, (w5 >> 2) & 0x7fffff, parm);\ + DST(op,i*64+15, (w5 >> 25) & 0x7fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*23+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w5 >> 48) | (w6 << 16) & 0x7fffff, parm);\ + DST(op,i*64+17, (w6 >> 7) & 0x7fffff, parm);\ + DST(op,i*64+18, (w6 >> 30) & 0x7fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*23+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w6 >> 53) | (w7 << 11) & 0x7fffff, parm);\ + DST(op,i*64+20, (w7 >> 12) & 0x7fffff, parm);\ + DST(op,i*64+21, (w7 >> 35) & 0x7fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*23+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w7 >> 58) | (w8 << 6) & 0x7fffff, parm);\ + DST(op,i*64+23, (w8 >> 17) & 0x7fffff, parm);\ + DST(op,i*64+24, (w8 >> 40) & 0x7fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*23+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w8 >> 63) | (w9 << 1) & 0x7fffff, parm);\ + DST(op,i*64+26, (w9 >> 22) & 0x7fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*23+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+27, (w9 >> 45) | (w10 << 19) & 0x7fffff, parm);\ + DST(op,i*64+28, (w10 >> 4) & 0x7fffff, parm);\ + DST(op,i*64+29, (w10 >> 27) & 0x7fffff, parm); register uint32_t w11 = *(uint32_t *)(ip+(i*23+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w10 >> 
50) | (w11 << 14) & 0x7fffff, parm);\ + DST(op,i*64+31, (w11 >> 9) & 0x7fffff, parm);;\ +} + +#define BITUNPACK64_23(ip, op, parm) { \ + BITUNBLK64_23(ip, 0, op, parm); DSTI(op); ip += 23*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_24(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ + DST(op,i*8+ 0, (w0 ) & 0xffffff, parm);\ + DST(op,i*8+ 1, (w0 >> 24) & 0xffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*8+ 2, (w0 >> 48) | (w1 << 16) & 0xffffff, parm);\ + DST(op,i*8+ 3, (w1 >> 8) & 0xffffff, parm);\ + DST(op,i*8+ 4, (w1 >> 32) & 0xffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*8+ 5, (w1 >> 56) | (w2 << 8) & 0xffffff, parm);\ + DST(op,i*8+ 6, (w2 >> 16) & 0xffffff, parm);\ + DST(op,i*8+ 7, (w2 >> 40) , parm);;\ +} + +#define BITUNPACK64_24(ip, op, parm) { \ + BITUNBLK64_24(ip, 0, op, parm);\ + BITUNBLK64_24(ip, 1, op, parm);\ + BITUNBLK64_24(ip, 2, op, parm);\ + BITUNBLK64_24(ip, 3, op, parm); DSTI(op); ip += 24*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_25(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*25+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1ffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 25) & 0x1ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*25+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 50) | (w1 << 14) & 0x1ffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 11) & 0x1ffffff, parm);\ + DST(op,i*64+ 4, (w1 >> 36) & 0x1ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*25+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 5, (w1 >> 61) | (w2 << 3) & 0x1ffffff, parm);\ + DST(op,i*64+ 6, (w2 >> 22) & 0x1ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*25+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w2 >> 47) | (w3 << 17) & 0x1ffffff, parm);\ + DST(op,i*64+ 8, (w3 >> 8) & 0x1ffffff, parm);\ + DST(op,i*64+ 9, (w3 >> 33) & 0x1ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*25+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w3 >> 58) | (w4 << 6) & 0x1ffffff, parm);\ + DST(op,i*64+11, (w4 >> 19) & 0x1ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*25+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w4 >> 44) | (w5 << 20) & 0x1ffffff, parm);\ + DST(op,i*64+13, (w5 >> 5) & 0x1ffffff, parm);\ + DST(op,i*64+14, (w5 >> 30) & 0x1ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*25+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w5 >> 55) | (w6 << 9) & 0x1ffffff, parm);\ + DST(op,i*64+16, (w6 >> 16) & 0x1ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*25+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w6 >> 41) | (w7 << 23) & 0x1ffffff, parm);\ + DST(op,i*64+18, (w7 >> 2) & 0x1ffffff, parm);\ + DST(op,i*64+19, (w7 >> 27) & 0x1ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*25+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w7 >> 52) | (w8 << 12) & 0x1ffffff, parm);\ + DST(op,i*64+21, (w8 >> 13) & 0x1ffffff, parm);\ + DST(op,i*64+22, (w8 >> 38) & 0x1ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*25+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w8 >> 63) | (w9 << 1) & 0x1ffffff, parm);\ + DST(op,i*64+24, (w9 >> 24) & 0x1ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*25+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+25, (w9 >> 49) | (w10 << 15) & 0x1ffffff, parm);\ + DST(op,i*64+26, (w10 >> 10) & 0x1ffffff, parm);\ + DST(op,i*64+27, (w10 >> 35) & 0x1ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*25+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w10 >> 60) | (w11 << 4) & 0x1ffffff, parm);\ + 
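/* odd bit widths end on a 32-bit tail: 32 values * 25 bits = twelve 64-bit words + one uint32_t (w12) */\ +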
DST(op,i*64+29, (w11 >> 21) & 0x1ffffff, parm); register uint32_t w12 = *(uint32_t *)(ip+(i*25+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w11 >> 46) | (w12 << 18) & 0x1ffffff, parm);\ + DST(op,i*64+31, (w12 >> 7) & 0x1ffffff, parm);;\ +} + +#define BITUNPACK64_25(ip, op, parm) { \ + BITUNBLK64_25(ip, 0, op, parm); DSTI(op); ip += 25*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_26(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3ffffff, parm);\ + DST(op,i*32+ 1, (w0 >> 26) & 0x3ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 52) | (w1 << 12) & 0x3ffffff, parm);\ + DST(op,i*32+ 3, (w1 >> 14) & 0x3ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w1 >> 40) | (w2 << 24) & 0x3ffffff, parm);\ + DST(op,i*32+ 5, (w2 >> 2) & 0x3ffffff, parm);\ + DST(op,i*32+ 6, (w2 >> 28) & 0x3ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 7, (w2 >> 54) | (w3 << 10) & 0x3ffffff, parm);\ + DST(op,i*32+ 8, (w3 >> 16) & 0x3ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 9, (w3 >> 42) | (w4 << 22) & 0x3ffffff, parm);\ + DST(op,i*32+10, (w4 >> 4) & 0x3ffffff, parm);\ + DST(op,i*32+11, (w4 >> 30) & 0x3ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w4 >> 56) | (w5 << 8) & 0x3ffffff, parm);\ + DST(op,i*32+13, (w5 >> 18) & 0x3ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w5 >> 44) | (w6 << 20) & 0x3ffffff, parm);\ + DST(op,i*32+15, (w6 >> 6) & 0x3ffffff, parm);\ + DST(op,i*32+16, (w6 >> 32) & 0x3ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*13+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w6 >> 58) | (w7 << 6) & 0x3ffffff, parm);\ + DST(op,i*32+18, (w7 >> 20) & 0x3ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*13+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w7 >> 46) | (w8 << 18) & 0x3ffffff, parm);\ + DST(op,i*32+20, (w8 >> 8) & 0x3ffffff, parm);\ + DST(op,i*32+21, (w8 >> 34) & 0x3ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*13+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+22, (w8 >> 60) | (w9 << 4) & 0x3ffffff, parm);\ + DST(op,i*32+23, (w9 >> 22) & 0x3ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*13+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+24, (w9 >> 48) | (w10 << 16) & 0x3ffffff, parm);\ + DST(op,i*32+25, (w10 >> 10) & 0x3ffffff, parm);\ + DST(op,i*32+26, (w10 >> 36) & 0x3ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*13+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w10 >> 62) | (w11 << 2) & 0x3ffffff, parm);\ + DST(op,i*32+28, (w11 >> 24) & 0x3ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*13+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w11 >> 50) | (w12 << 14) & 0x3ffffff, parm);\ + DST(op,i*32+30, (w12 >> 12) & 0x3ffffff, parm);\ + DST(op,i*32+31, (w12 >> 38) , parm);;\ +} + +#define BITUNPACK64_26(ip, op, parm) { \ + BITUNBLK64_26(ip, 0, op, parm); DSTI(op); ip += 26*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_27(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*27+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7ffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 27) & 0x7ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*27+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 54) | (w1 << 10) & 0x7ffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 17) & 
0x7ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*27+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 44) | (w2 << 20) & 0x7ffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 7) & 0x7ffffff, parm);\ + DST(op,i*64+ 6, (w2 >> 34) & 0x7ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*27+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 7, (w2 >> 61) | (w3 << 3) & 0x7ffffff, parm);\ + DST(op,i*64+ 8, (w3 >> 24) & 0x7ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*27+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 9, (w3 >> 51) | (w4 << 13) & 0x7ffffff, parm);\ + DST(op,i*64+10, (w4 >> 14) & 0x7ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*27+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w4 >> 41) | (w5 << 23) & 0x7ffffff, parm);\ + DST(op,i*64+12, (w5 >> 4) & 0x7ffffff, parm);\ + DST(op,i*64+13, (w5 >> 31) & 0x7ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*27+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w5 >> 58) | (w6 << 6) & 0x7ffffff, parm);\ + DST(op,i*64+15, (w6 >> 21) & 0x7ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*27+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w6 >> 48) | (w7 << 16) & 0x7ffffff, parm);\ + DST(op,i*64+17, (w7 >> 11) & 0x7ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*27+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w7 >> 38) | (w8 << 26) & 0x7ffffff, parm);\ + DST(op,i*64+19, (w8 >> 1) & 0x7ffffff, parm);\ + DST(op,i*64+20, (w8 >> 28) & 0x7ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*27+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+21, (w8 >> 55) | (w9 << 9) & 0x7ffffff, parm);\ + DST(op,i*64+22, (w9 >> 18) & 0x7ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*27+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+23, (w9 >> 45) | (w10 << 19) & 0x7ffffff, parm);\ + DST(op,i*64+24, (w10 >> 8) & 0x7ffffff, parm);\ + DST(op,i*64+25, (w10 >> 35) & 0x7ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*27+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w10 >> 62) | (w11 << 2) & 0x7ffffff, parm);\ + DST(op,i*64+27, (w11 >> 25) & 0x7ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*27+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w11 >> 52) | (w12 << 12) & 0x7ffffff, parm);\ + DST(op,i*64+29, (w12 >> 15) & 0x7ffffff, parm); register uint32_t w13 = *(uint32_t *)(ip+(i*27+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w12 >> 42) | (w13 << 22) & 0x7ffffff, parm);\ + DST(op,i*64+31, (w13 >> 5) & 0x7ffffff, parm);;\ +} + +#define BITUNPACK64_27(ip, op, parm) { \ + BITUNBLK64_27(ip, 0, op, parm); DSTI(op); ip += 27*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_28(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ + DST(op,i*16+ 0, (w0 ) & 0xfffffff, parm);\ + DST(op,i*16+ 1, (w0 >> 28) & 0xfffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 2, (w0 >> 56) | (w1 << 8) & 0xfffffff, parm);\ + DST(op,i*16+ 3, (w1 >> 20) & 0xfffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 4, (w1 >> 48) | (w2 << 16) & 0xfffffff, parm);\ + DST(op,i*16+ 5, (w2 >> 12) & 0xfffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 6, (w2 >> 40) | (w3 << 24) & 0xfffffff, parm);\ + DST(op,i*16+ 7, (w3 >> 4) & 0xfffffff, parm);\ + DST(op,i*16+ 8, (w3 >> 32) & 0xfffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+ 9, (w3 >> 60) | (w4 << 4) & 0xfffffff, parm);\ + DST(op,i*16+10, (w4 >> 24) & 0xfffffff, parm); register uint64_t w5 = 
*(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+11, (w4 >> 52) | (w5 << 12) & 0xfffffff, parm);\ + DST(op,i*16+12, (w5 >> 16) & 0xfffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*16+13, (w5 >> 44) | (w6 << 20) & 0xfffffff, parm);\ + DST(op,i*16+14, (w6 >> 8) & 0xfffffff, parm);\ + DST(op,i*16+15, (w6 >> 36) , parm);;\ +} + +#define BITUNPACK64_28(ip, op, parm) { \ + BITUNBLK64_28(ip, 0, op, parm);\ + BITUNBLK64_28(ip, 1, op, parm); DSTI(op); ip += 28*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_29(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*29+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x1fffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 29) & 0x1fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*29+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 58) | (w1 << 6) & 0x1fffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 23) & 0x1fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*29+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 52) | (w2 << 12) & 0x1fffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 17) & 0x1fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*29+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w2 >> 46) | (w3 << 18) & 0x1fffffff, parm);\ + DST(op,i*64+ 7, (w3 >> 11) & 0x1fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*29+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w3 >> 40) | (w4 << 24) & 0x1fffffff, parm);\ + DST(op,i*64+ 9, (w4 >> 5) & 0x1fffffff, parm);\ + DST(op,i*64+10, (w4 >> 34) & 0x1fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*29+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+11, (w4 >> 63) | (w5 << 1) & 0x1fffffff, parm);\ + DST(op,i*64+12, (w5 >> 28) & 0x1fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*29+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+13, (w5 >> 57) | (w6 << 7) & 0x1fffffff, parm);\ + DST(op,i*64+14, (w6 >> 22) & 0x1fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*29+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+15, (w6 >> 51) | (w7 << 13) & 0x1fffffff, parm);\ + DST(op,i*64+16, (w7 >> 16) & 0x1fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*29+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+17, (w7 >> 45) | (w8 << 19) & 0x1fffffff, parm);\ + DST(op,i*64+18, (w8 >> 10) & 0x1fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*29+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+19, (w8 >> 39) | (w9 << 25) & 0x1fffffff, parm);\ + DST(op,i*64+20, (w9 >> 4) & 0x1fffffff, parm);\ + DST(op,i*64+21, (w9 >> 33) & 0x1fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*29+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w9 >> 62) | (w10 << 2) & 0x1fffffff, parm);\ + DST(op,i*64+23, (w10 >> 27) & 0x1fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*29+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w10 >> 56) | (w11 << 8) & 0x1fffffff, parm);\ + DST(op,i*64+25, (w11 >> 21) & 0x1fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*29+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w11 >> 50) | (w12 << 14) & 0x1fffffff, parm);\ + DST(op,i*64+27, (w12 >> 15) & 0x1fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*29+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w12 >> 44) | (w13 << 20) & 0x1fffffff, parm);\ + DST(op,i*64+29, (w13 >> 9) & 0x1fffffff, parm); register uint32_t w14 = *(uint32_t *)(ip+(i*29+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w13 >> 38) | (w14 << 26) & 0x1fffffff, parm);\ + DST(op,i*64+31, (w14 >> 3) & 0x1fffffff, parm);;\ +} + +#define BITUNPACK64_29(ip, op, parm) { \ + BITUNBLK64_29(ip, 0, op, parm); DSTI(op); ip += 
29*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_30(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ + DST(op,i*32+ 0, (w0 ) & 0x3fffffff, parm);\ + DST(op,i*32+ 1, (w0 >> 30) & 0x3fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 2, (w0 >> 60) | (w1 << 4) & 0x3fffffff, parm);\ + DST(op,i*32+ 3, (w1 >> 26) & 0x3fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 4, (w1 >> 56) | (w2 << 8) & 0x3fffffff, parm);\ + DST(op,i*32+ 5, (w2 >> 22) & 0x3fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 6, (w2 >> 52) | (w3 << 12) & 0x3fffffff, parm);\ + DST(op,i*32+ 7, (w3 >> 18) & 0x3fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+ 8, (w3 >> 48) | (w4 << 16) & 0x3fffffff, parm);\ + DST(op,i*32+ 9, (w4 >> 14) & 0x3fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+10, (w4 >> 44) | (w5 << 20) & 0x3fffffff, parm);\ + DST(op,i*32+11, (w5 >> 10) & 0x3fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+12, (w5 >> 40) | (w6 << 24) & 0x3fffffff, parm);\ + DST(op,i*32+13, (w6 >> 6) & 0x3fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+14, (w6 >> 36) | (w7 << 28) & 0x3fffffff, parm);\ + DST(op,i*32+15, (w7 >> 2) & 0x3fffffff, parm);\ + DST(op,i*32+16, (w7 >> 32) & 0x3fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*15+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+17, (w7 >> 62) | (w8 << 2) & 0x3fffffff, parm);\ + DST(op,i*32+18, (w8 >> 28) & 0x3fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*15+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+19, (w8 >> 58) | (w9 << 6) & 0x3fffffff, parm);\ + DST(op,i*32+20, (w9 >> 24) & 0x3fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*15+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+21, (w9 >> 54) | (w10 << 10) & 0x3fffffff, parm);\ + DST(op,i*32+22, (w10 >> 20) & 0x3fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*15+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+23, (w10 >> 50) | (w11 << 14) & 0x3fffffff, parm);\ + DST(op,i*32+24, (w11 >> 16) & 0x3fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*15+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+25, (w11 >> 46) | (w12 << 18) & 0x3fffffff, parm);\ + DST(op,i*32+26, (w12 >> 12) & 0x3fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*15+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+27, (w12 >> 42) | (w13 << 22) & 0x3fffffff, parm);\ + DST(op,i*32+28, (w13 >> 8) & 0x3fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*15+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*32+29, (w13 >> 38) | (w14 << 26) & 0x3fffffff, parm);\ + DST(op,i*32+30, (w14 >> 4) & 0x3fffffff, parm);\ + DST(op,i*32+31, (w14 >> 34) , parm);;\ +} + +#define BITUNPACK64_30(ip, op, parm) { \ + BITUNBLK64_30(ip, 0, op, parm); DSTI(op); ip += 30*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_31(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*31+0)*8/sizeof(ip[0]));\ + DST(op,i*64+ 0, (w0 ) & 0x7fffffff, parm);\ + DST(op,i*64+ 1, (w0 >> 31) & 0x7fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*31+1)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 2, (w0 >> 62) | (w1 << 2) & 0x7fffffff, parm);\ + DST(op,i*64+ 3, (w1 >> 29) & 0x7fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*31+2)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 4, (w1 >> 60) | (w2 << 
4) & 0x7fffffff, parm);\ + DST(op,i*64+ 5, (w2 >> 27) & 0x7fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*31+3)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 6, (w2 >> 58) | (w3 << 6) & 0x7fffffff, parm);\ + DST(op,i*64+ 7, (w3 >> 25) & 0x7fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*31+4)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+ 8, (w3 >> 56) | (w4 << 8) & 0x7fffffff, parm);\ + DST(op,i*64+ 9, (w4 >> 23) & 0x7fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*31+5)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+10, (w4 >> 54) | (w5 << 10) & 0x7fffffff, parm);\ + DST(op,i*64+11, (w5 >> 21) & 0x7fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*31+6)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+12, (w5 >> 52) | (w6 << 12) & 0x7fffffff, parm);\ + DST(op,i*64+13, (w6 >> 19) & 0x7fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*31+7)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+14, (w6 >> 50) | (w7 << 14) & 0x7fffffff, parm);\ + DST(op,i*64+15, (w7 >> 17) & 0x7fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*31+8)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+16, (w7 >> 48) | (w8 << 16) & 0x7fffffff, parm);\ + DST(op,i*64+17, (w8 >> 15) & 0x7fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*31+9)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+18, (w8 >> 46) | (w9 << 18) & 0x7fffffff, parm);\ + DST(op,i*64+19, (w9 >> 13) & 0x7fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*31+10)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+20, (w9 >> 44) | (w10 << 20) & 0x7fffffff, parm);\ + DST(op,i*64+21, (w10 >> 11) & 0x7fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*31+11)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+22, (w10 >> 42) | (w11 << 22) & 0x7fffffff, parm);\ + DST(op,i*64+23, (w11 >> 9) & 0x7fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*31+12)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+24, (w11 >> 40) | (w12 << 24) & 0x7fffffff, parm);\ + DST(op,i*64+25, (w12 >> 7) & 0x7fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*31+13)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+26, (w12 >> 38) | (w13 << 26) & 0x7fffffff, parm);\ + DST(op,i*64+27, (w13 >> 5) & 0x7fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*31+14)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+28, (w13 >> 36) | (w14 << 28) & 0x7fffffff, parm);\ + DST(op,i*64+29, (w14 >> 3) & 0x7fffffff, parm); register uint32_t w15 = *(uint32_t *)(ip+(i*31+15)*8/sizeof(ip[0]));\ +\ + DST(op,i*64+30, (w14 >> 34) | (w15 << 30) & 0x7fffffff, parm);\ + DST(op,i*64+31, (w15 >> 1) & 0x7fffffff, parm);;\ +} + +#define BITUNPACK64_31(ip, op, parm) { \ + BITUNBLK64_31(ip, 0, op, parm); DSTI(op); ip += 31*4/sizeof(ip[0]);\ +} + +#define BITUNBLK64_32(ip, i, op, parm) { \ + DST(op,i*2+ 0, *(uint32_t *)(ip+i*8+ 0), parm);\ + DST(op,i*2+ 1, *(uint32_t *)(ip+i*8+ 4), parm);;\ +} + +#define BITUNPACK64_32(ip, op, parm) { \ + BITUNBLK64_32(ip, 0, op, parm);\ + BITUNBLK64_32(ip, 1, op, parm);\ + BITUNBLK64_32(ip, 2, op, parm);\ + BITUNBLK64_32(ip, 3, op, parm);\ + BITUNBLK64_32(ip, 4, op, parm);\ + BITUNBLK64_32(ip, 5, op, parm);\ + BITUNBLK64_32(ip, 6, op, parm);\ + BITUNBLK64_32(ip, 7, op, parm);\ + BITUNBLK64_32(ip, 8, op, parm);\ + BITUNBLK64_32(ip, 9, op, parm);\ + BITUNBLK64_32(ip, 10, op, parm);\ + BITUNBLK64_32(ip, 11, op, parm);\ + BITUNBLK64_32(ip, 12, op, parm);\ + BITUNBLK64_32(ip, 13, op, parm);\ + BITUNBLK64_32(ip, 14, op, parm);\ + BITUNBLK64_32(ip, 15, op, parm); DSTI(op); ip += 32*4/sizeof(ip[0]);\ +} + diff --git a/bitunpack_.h b/bitunpack_.h new file mode 100644 index 0000000..172e3d4 --- /dev/null +++ b/bitunpack_.h @@ -0,0 
+1,112 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + bitunpack_.h - "Integer Compression" binary packing +**/ + +#include +#define DST( __op,__x, __w, __parm) *__op++ = BPI(__w,__parm) //__op[__x] = BPI(__w,__parm) // +#define DSTI(__op) //__op += 32 // + +#define USE_BITUNPACK 64 + + #if USE_BITUNPACK == 64 +#include "bitunpack64_.h" +#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\ + switch(__nbits) {\ + case 0: do BITUNPACK64_0( __ip, __op, __parm) while(__op<__ope); break;\ + case 1: do BITUNPACK64_1( __ip, __op, __parm) while(__op<__ope); break;\ + case 2: do BITUNPACK64_2( __ip, __op, __parm) while(__op<__ope); break;\ + case 3: do BITUNPACK64_3( __ip, __op, __parm) while(__op<__ope); break;\ + case 4: do BITUNPACK64_4( __ip, __op, __parm) while(__op<__ope); break;\ + case 5: do BITUNPACK64_5( __ip, __op, __parm) while(__op<__ope); break;\ + case 6: do BITUNPACK64_6( __ip, __op, __parm) while(__op<__ope); break;\ + case 7: do BITUNPACK64_7( __ip, __op, __parm) while(__op<__ope); break;\ + case 8: do BITUNPACK64_8( __ip, __op, __parm) while(__op<__ope); break;\ + case 9: do BITUNPACK64_9( __ip, __op, __parm) while(__op<__ope); break;\ + case 10: do BITUNPACK64_10(__ip, __op, __parm) while(__op<__ope); break;\ + case 11: do BITUNPACK64_11(__ip, __op, __parm) while(__op<__ope); break;\ + case 12: do BITUNPACK64_12(__ip, __op, __parm) while(__op<__ope); break;\ + case 13: do BITUNPACK64_13(__ip, __op, __parm) while(__op<__ope); break;\ + case 14: do BITUNPACK64_14(__ip, __op, __parm) while(__op<__ope); break;\ + case 15: do BITUNPACK64_15(__ip, __op, __parm) while(__op<__ope); break;\ + case 16: do BITUNPACK64_16(__ip, __op, __parm) while(__op<__ope); break;\ + case 17: do BITUNPACK64_17(__ip, __op, __parm) while(__op<__ope); break;\ + case 18: do BITUNPACK64_18(__ip, __op, __parm) while(__op<__ope); break;\ + case 19: do BITUNPACK64_19(__ip, __op, __parm) while(__op<__ope); break;\ + case 20: do BITUNPACK64_20(__ip, __op, __parm) while(__op<__ope); break;\ + case 21: do BITUNPACK64_21(__ip, __op, __parm) while(__op<__ope); break;\ + case 22: do BITUNPACK64_22(__ip, __op, __parm) while(__op<__ope); break;\ + case 23: do BITUNPACK64_23(__ip, __op, __parm) while(__op<__ope); break;\ + case 24: do BITUNPACK64_24(__ip, __op, __parm) while(__op<__ope); break;\ + case 25: do BITUNPACK64_25(__ip, __op, __parm) while(__op<__ope); break;\ + case 26: do BITUNPACK64_26(__ip, __op, __parm) while(__op<__ope); break;\ + case 27: do BITUNPACK64_27(__ip, __op, __parm) while(__op<__ope); break;\ + case 28: do BITUNPACK64_28(__ip, __op, __parm) 
while(__op<__ope); break;\ + case 29: do BITUNPACK64_29(__ip, __op, __parm) while(__op<__ope); break;\ + case 30: do BITUNPACK64_30(__ip, __op, __parm) while(__op<__ope); break;\ + case 31: do BITUNPACK64_31(__ip, __op, __parm) while(__op<__ope); break;\ + case 32: do BITUNPACK64_32(__ip, __op, __parm) while(__op<__ope); break;\ + }\ +} + #elif USE_BITUNPACK == 32 +#include "bitunpack32_.h" +#define BITUNPACK32(__ip, __n, __nbits, __op, __parm) { typeof(__op[0]) *__ope = __op + __n;/*((__n+31)&0xffffffe0u)*/;\ + switch(__nbits) {\ + case 0: do BITUNPACK32_0( __ip, __op, __parm) while(__op<__ope); break;\ + case 1: do BITUNPACK32_1( __ip, __op, __parm) while(__op<__ope); break;\ + case 2: do BITUNPACK32_2( __ip, __op, __parm) while(__op<__ope); break;\ + case 3: do BITUNPACK32_3( __ip, __op, __parm) while(__op<__ope); break;\ + case 4: do BITUNPACK32_4( __ip, __op, __parm) while(__op<__ope); break;\ + case 5: do BITUNPACK32_5( __ip, __op, __parm) while(__op<__ope); break;\ + case 6: do BITUNPACK32_6( __ip, __op, __parm) while(__op<__ope); break;\ + case 7: do BITUNPACK32_7( __ip, __op, __parm) while(__op<__ope); break;\ + case 8: do BITUNPACK32_8( __ip, __op, __parm) while(__op<__ope); break;\ + case 9: do BITUNPACK32_9( __ip, __op, __parm) while(__op<__ope); break;\ + case 10: do BITUNPACK32_10(__ip, __op, __parm) while(__op<__ope); break;\ + case 11: do BITUNPACK32_11(__ip, __op, __parm) while(__op<__ope); break;\ + case 12: do BITUNPACK32_12(__ip, __op, __parm) while(__op<__ope); break;\ + case 13: do BITUNPACK32_13(__ip, __op, __parm) while(__op<__ope); break;\ + case 14: do BITUNPACK32_14(__ip, __op, __parm) while(__op<__ope); break;\ + case 15: do BITUNPACK32_15(__ip, __op, __parm) while(__op<__ope); break;\ + case 16: do BITUNPACK32_16(__ip, __op, __parm) while(__op<__ope); break;\ + case 17: do BITUNPACK32_17(__ip, __op, __parm) while(__op<__ope); break;\ + case 18: do BITUNPACK32_18(__ip, __op, __parm) while(__op<__ope); break;\ + case 19: do BITUNPACK32_19(__ip, __op, __parm) while(__op<__ope); break;\ + case 20: do BITUNPACK32_20(__ip, __op, __parm) while(__op<__ope); break;\ + case 21: do BITUNPACK32_21(__ip, __op, __parm) while(__op<__ope); break;\ + case 22: do BITUNPACK32_22(__ip, __op, __parm) while(__op<__ope); break;\ + case 23: do BITUNPACK32_23(__ip, __op, __parm) while(__op<__ope); break;\ + case 24: do BITUNPACK32_24(__ip, __op, __parm) while(__op<__ope); break;\ + case 25: do BITUNPACK32_25(__ip, __op, __parm) while(__op<__ope); break;\ + case 26: do BITUNPACK32_26(__ip, __op, __parm) while(__op<__ope); break;\ + case 27: do BITUNPACK32_27(__ip, __op, __parm) while(__op<__ope); break;\ + case 28: do BITUNPACK32_28(__ip, __op, __parm) while(__op<__ope); break;\ + case 29: do BITUNPACK32_29(__ip, __op, __parm) while(__op<__ope); break;\ + case 30: do BITUNPACK32_30(__ip, __op, __parm) while(__op<__ope); break;\ + case 31: do BITUNPACK32_31(__ip, __op, __parm) while(__op<__ope); break;\ + case 32: do BITUNPACK32_32(__ip, __op, __parm) while(__op<__ope); break;\ + } /*printf("n=%d,%d,%d ", __n, __op, __parm - sd, __op, __parme - __op);*/\ +} +#endif + diff --git a/conf.h b/conf.h new file mode 100644 index 0000000..2383ad1 --- /dev/null +++ b/conf.h @@ -0,0 +1,70 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
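/* What the unrolled BITUNBLK64_xx macros above compute, written as a plain
   reference loop. This is an illustrative sketch, not part of the library:
   value idx of a b-bit packed stream starts at absolute bit position idx*b
   and may straddle a 64-bit word boundary, in which case the high part comes
   from the next word. The unrolled macros are the branch-free, fully
   scheduled equivalent of this (for 0 < b <= 32). */
static unsigned bitget_ref(const unsigned long long *in, unsigned b, unsigned idx) {
  unsigned bp = idx * b, s = bp & 63;            /* absolute bit position, in-word shift */
  unsigned long long w = in[bp >> 6] >> s;       /* low part of the value */
  if(s + b > 64)                                 /* value straddles a word boundary */
    w |= in[(bp >> 6) + 1] << (64 - s);
  return w & (((unsigned long long)1 << b) - 1); /* keep the low b bits */
}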
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ conf.h - "Integer Compression" config & common
+**/
+
+
+ #if defined(__GNUC__)
+#define ALIGNED(t,v,n) __attribute__ ((aligned (n))) t v
+#define ALWAYS_INLINE __attribute__((always_inline))
+#define _PACKED __attribute__ ((packed))
+#define likely(x) __builtin_expect((x),1)
+#define unlikely(x) __builtin_expect((x),0)
+
+#define popcnt32(__x) __builtin_popcount(__x)
+#define popcnt64(__x) __builtin_popcountll(__x)
+
+#define TEMPLATE2_(__x, __y) __x##__y
+#define TEMPLATE2(__x, __y) TEMPLATE2_(__x,__y)
+
+#define TEMPLATE3_(x,y,z) x ## y ## z
+#define TEMPLATE3(x,y,z) TEMPLATE3_(x, y, z)
+
+ #if defined(__x86_64__) || defined(__x86_32__)
+static inline int bsr32(int x) {
+ int b = -1;
+ asm("bsrl %1,%0" : "+r" (b): "rm" (x) );
+ return b + 1;
+}
+
+static inline int bsr64(unsigned long long x) {
+ return x?64 - __builtin_clzll(x):0;
+}
+
+#define bsr16(__x) bsr32(__x)
+ #else
+static inline int bsr32(int x) {
+ return x?32 - __builtin_clz(x):0;
+}
+
+static inline int bsr64(unsigned long long x) {
+ return x?64 - __builtin_clzll(x):0;
+}
+ #endif
+#define ctzll(__x) __builtin_ctzll(__x)
+ #else
+#error "only gcc is supported in this version"
+ #endif
+
+
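/* bsr32(x) above returns the number of significant bits of x (0 for x == 0),
   i.e. the smallest b with x < 2^b. This is the primitive every packing
   scheme here keys on. A sketch of the usual pattern: OR a block together,
   then take one bsr to get the bit width of its largest value. */
static unsigned block_bits(const unsigned *in, int n) {
  unsigned b = 0; int i;
  for(i = 0; i < n; i++) b |= in[i]; /* one OR per value instead of n bsr calls */
  return bsr32(b);                   /* bits needed to hold every value of the block */
}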
diff --git a/icbench.c b/icbench.c
new file mode 100644
index 0000000..d417e9f
--- /dev/null
+++ b/icbench.c
@@ -0,0 +1,617 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ icbench.c - "Integer Compression" benchmark program
+**/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <getopt.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#define PGM_FD(__f) struct stat sbuf; fstat(__f, &sbuf); __off64_t vlen = sbuf.st_size, vtel = 0; int pgm = 0; time_t t0 = time(NULL);
+#define PGM_FDPUT(__f) vtel = lseek(__f, 0, SEEK_CUR);if(vtel*10/vlen != pgm) { double secs = time(NULL) - t0; pgm = vtel*10/vlen; printf("%d%%%.1f ", pgm, ((secs/60.0) * (vlen - vtel))/vtel); fflush(stdout); }
+//-------------------------------------------------------------------------------------------------------------
+typedef unsigned long long tm_t;
+#define TM_TMAX (1ull<<63)
+
+ #ifdef _MSC_VER // __rdtsc
+#include <intrin.h>
+ #else
+#include <x86intrin.h>
+ #endif
+
+ #ifdef _WIN32
+#include <windows.h>
+#define TM_T 1
+
+LARGE_INTEGER tps;
+static tm_t tmtime(void) {
+ LARGE_INTEGER tm;
+ QueryPerformanceCounter(&tm);
+ return (tm_t)(tm.QuadPart/tps.QuadPart);
+}
+
+static tm_t tminit() { QueryPerformanceFrequency(&tps); tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+ #else
+#include <sys/time.h>
+#define TM_T 1000000.0
+static tm_t tmtime(void) {
+ struct timeval tm;
+ gettimeofday(&tm, NULL);
+ return (tm_t)tm.tv_sec*1000000ull + tm.tv_usec;
+}
+
+static tm_t tminit() { tm_t t0=tmtime(),ts; while((ts = tmtime())==t0); return ts; }
+ #endif
+//--------------------------------------------------------------------------------------------------------
+#include "vint.h"
+#include "vsimple.h"
+
+#include "bitpack.h"
+#include "bitunpack.h"
+#include "vp4dc.h"
+#include "vp4dd.h"
+
+#include "aux/vas16c.h"
+#include "aux/vas16d.h"
+#include "aux/OPT_PFD/opt_p4.h"
+#include "aux/vabyte.h"
+#include "aux/simple8b.h"
+#include "aux/varintg8iu.h"
+
+unsigned char *simdpackwn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+ uint32_t *in_;
+ for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmask(in, (__m128i *)out, b);
+ return out;
+}
+
+unsigned char *simdpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+ uint32_t *in_;
+ for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpack(in, (__m128i *)out, b);
+ return out;
+}
+
+unsigned char *simdunpackn(uint32_t *in, uint32_t n, uint32_t b, uint32_t *out) {
+ uint32_t k, *out_;
+ for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpack(in, out, b);
+ return in;
+}
+
+unsigned char *simdpackwn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {//checkifdivisibleby(n, 128); const uint32_t * const initout(out); //while(needPaddingTo128Bits(out)) *out++ = 123456;
+ uint32_t *in_;
+ for(in_ = in + n; in + 128 <= in_; in += 128, out += 4 * b) simdpackwithoutmaskd1(start, in, (__m128i *)out, b); //simdpackwithoutmaskd1(x, ip+1, (__m128i *)out, b);
+ return out;
+}
+
+unsigned char *simdunpackn1(uint32_t *in, uint32_t n, uint32_t b, uint32_t start, uint32_t *out) {
+ uint32_t k, *out_;
+ for(out_ = out + n; out + 128 <= out_; out += 128, in += 4 * b) simdunpackd1(start, in, out, b);
+ return in;
+}
+
+unsigned char *u32enc(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) *out++ = *in++; return out;}
+unsigned char *u32dec(unsigned *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n; while(out < out_) *out++ = *in++; return in;}
+
+#include "aux/vbyte_poly.h"
+unsigned char *vavbyte1enc(int *in, int n, unsigned char *out) {
+ int i; for(i = 0; i < n; i++) { unsigned x = in[i]; VBYTE_ENC(out, x); } return out;
+}
+void vavbyte1dec(unsigned char *in, int n, int *out) {
+ int i; for(i = 0; i < n; i++) { unsigned x; VBYTE_DEC(in, x); out[i] = x; }
+}
+
+//-------------------------------------------------------------------------------------------------
+#define VBLIM 64
+enum {
+ P_CPY,
+ P_VB, P_VBL, P_VG8,
+ P_PCK, P_PCKR, P_SIMDH,
+ P_SV, P_S16, P_S8BO,
+ P_P4D, P_P4DR, P_OPTP4
+};
+
+unsigned char *beenc(unsigned *in, size_t n, unsigned char *out, int id, int bb) {
+ unsigned *ip=in;
+ int i,b;
+
+ switch(id) {
+ case P_CPY:
+ out = u32enc( ip, n, out); break;
+ case P_VB:
+ out = vbenc( ip, n, out); break;
+ case P_VBL:
+ out = vbyteenc( ip, n, out); break;
+ case P_VG8:
+ out = vintg8enc(ip, n, out); break;
+
+ //----------- simple -------------------
+ case P_SV:
+ out = vsenc32( ip, n, out); break;
+ case P_S16:
+ { unsigned *c=ip,*ce=c+n;
+ while(c < ce) S16ENC(out, c, ce - c);
+ }
+ break;
+ case P_S8BO:
+ out = s8benco( ip, n, out);
+ break;
+
+ //----------- PFOR -------------------
+ case P_P4DR:
+ case P_P4D:
+ if(n>= 5;
+ }
+ *op = x;
+ in = bitunpack32( in, n-1, b, op+1);
+ }
+ break;
+ case P_PCKR:
+ {
+ unsigned x;
+ vbgeta(in, x, ;);
+ if(bb < 0) {
+ b = x & 0x1f; x >>= 5;
+ }
+ *op = x;
+ in = _bitunpackx32(in, n-1, b, op+1);
+ }
+ break;
+ case P_SIMDH:
+ if(n <129) in = vbytedec(in, n, op);
+ else {
+ unsigned x;
+ vbgeta(in, x, ;);
+ if(bb < 0) {
+ b = x & 0x1f; x >>= 5;
+ }
+ *op = x;
+ in = simdunpackn( in, n-1, b, op+1);
+ }
+ break;
+ default: printf("Fatal: no entry %d", id); exit(0);
+ }
+ return in;
+}
+
+struct libss { int id; char *s,*v; };
+
+struct libss libss[] = {
+ { P_CPY, "copy", },
+ { P_VB, "TurboVbyte" },
+ { P_VBL, "Vbyte FPF" },
+ { P_VG8, "vg8iu" },
+
+ { P_SV, "simpleV" },
+ { P_S8BO, "simple 8b" },
+ { P_S16, "simple16" },
+
+ { P_P4DR, "TurboPFor DA" },
+ { P_P4D, "TurboPFor" },
+ { P_OPTP4, "OptP4" },
+
+ { P_PCK, "TurboPack" },
+ { P_PCKR, "TurboPack DA" },
+ { P_SIMDH, "SIMDBitPack FPF" },
+ { -1, "" },
+};
+
+//---------------------------------------------------------------------------------------------
+#define MAXT 8
+#define BLK_SIZE 129
+#define MB (1024*1024)
+
+int verb = 0, reps = 100000, trips = 3;
+enum { T_ZIPF=1, T_ID };
+
+struct libs { int id,err; char *s,*v; unsigned long long l; double tc,td; };
+struct libs libs[64];
+
+int l_cmp(const void *pa, const void *pb) {
+ const struct libs *a = pa, *b = pb;
+ if(a->l < b->l || a->l == b->l && a->td < b->td) return -1;
+ if(a->l > b->l || a->l == b->l && a->td > b->td) return 1;
+ return 0;
+}
+
+void check(unsigned *in, unsigned n, unsigned *out, char *s) {
+ unsigned k,j;
+ for(k = 0; k < n; k++)
+ if(in[k] != out[k]) {
+ printf("\nFATAL in check %x,%x at %u[%u] in %s\n", in[k], out[k], k, n, s);
+ for(j=k & 0xffffff80u; j < k+128;j++)
+ printf("%d:%x,%x ", j, in[j], out[j] );printf("\n");
+ exit(0);
+ }
+}
+
+void print(unsigned long long n, char *s) {
+ int m, k;
+ for(k = 0; libs[k].id >= 0; k++);
+ qsort(libs, k, sizeof(libs[0]), l_cmp);
+
+ for(m = 0; m < k; m++)
+ if(libs[m].l) {
+ struct libs *lb = &libs[m];
+ printf("%-16s%12llu\t%5.2f\t%5.2f\t%8.2f\t%8.2f\t%s\n", s, lb->l, (double)lb->l*100.0/((double)n*4.0), (double)lb->l*8.0/(double)n,
+ lb->tc>=0.000001?((double)n/1000000.0) / (lb->tc/TM_T):0.0,
+ lb->td>=0.000001?((double)n/1000000.0) / (lb->td/TM_T):0.0,
+ lb->s );
+ }
+}
+
+//int libini() { int m; for(m = 0; libs[m].id >= 0; m++) libs[m].l = libs[m].tc = libs[m].td = 0; }
+
+unsigned bench(unsigned *__restrict__ _in, unsigned _inlen, int blksize, unsigned char *__restrict__ _out, unsigned long long outsize, char *inname, tm_t tx, unsigned *__restrict__ cpy, int bb) { int m,id,b=bb,i; if(verb) { printf(":%d,", _inlen); fflush(stdout);}
+ unsigned cn; tm_t tt0 = tminit();
+ for(i = 0; i < 10; i++) memcpy(_out, _in, _inlen);
+ for(m = 0; (id=libs[m].id) >= 0; m++) { int r,insize=(id==P_OPTP4)?blksize-1:blksize;
+ struct libs *lb = &libs[m]; unsigned cl; if(verb) { printf("%s", libs[m].s);fflush(stdout); } int t,tj; tm_t t0,tc=TM_TMAX,td=TM_TMAX,tt;
+ for(t = 0; t < trips; t++) { t0 = tminit();
+ for(r = 0; r < reps; ) {
+ cn=cl=0;
+ unsigned *in;
+ unsigned char *out,*sout; //vsini();
+ for(out = _out, in = _in; in < _in+_inlen; ) {
+ unsigned n,inlen = *in++,*ip=in; in += inlen;
+ *(unsigned *)out = inlen; out+=4;/*out++=0x5a;*/
+ for(;ip < in; ip += n) { n = ip+insize<=in?insize:in-ip; cn += n; unsigned char *sout=out; //printf("%d ", n);
+ out = beenc(ip,n,out,id,bb);
+ cl +=out-sout;
+ } if(out > _out+outsize) { fprintf(stderr, "Overflow error %llu, %td in %s\n", outsize, (ptrdiff_t)(out - _out), lb->s); exit(0); }
+ } r++; if((tt = tmtime() - t0) > tx) break;
+ } if(tt < tc) { tc = tt; tj = r; }
+ if(tmtime() - tt0 > tx*trips) { /*printf("#");fflush(stdout);*/ /*sleep(1);*/tt0 = tminit(); }
+ }
+ lb->l += cl; lb->tc += tc/tj; memset(cpy, 0xf, _inlen*4); if(verb) { printf("+ ");fflush(stdout);}
+ tt0 = tminit();
+ for(t = 0; t < trips; t++) { t0 = tminit();
+ for(r = 0; r < reps; ) { unsigned *out; unsigned char *in;
+ for(out = cpy, in = _out; out < cpy+_inlen;) {
+ unsigned n,*op, outlen=*(unsigned *)in; in+=4;
+ *out++ = outlen;
+ for(op=out,out += outlen; op < out; op += n) {
+ n = op + insize<=out?insize:out-op;
+ in = bedec(in,n,op,id,bb);
+ }
+ }
+ r++;
+ if((tt = tmtime() - t0) > tx)
+ break;
+ }
+ if(tt < td) {
+ td = tt;
+ tj = r;
+ }
+ if(tmtime() - tt0 > tx*trips) {
+ tt0 = tminit();
+ }
+ } lb->td += td/tj;
+ check(_in, _inlen, cpy, lb->s);
+ }
+ return cn;
+}
+
+int z_cmp(const void *pa, const void *pb) {
+ double x = *(const double *)pa, y = *(const double *)pb;
+ if(x < y) return -1;
+ if(x > y) return 1;
+ return 0;
+}
+
+void zipfgen(unsigned *a, double alpha, unsigned x1, unsigned x2, int n) {
+ int i,m = x2 - x1 + 1;
+ double prob, cum, *zmap;
+ if(!(zmap = malloc(m*sizeof(zmap[0])))) {
+ fprintf(stderr, "malloc error\n");
+ exit(-1);
+ };
+
+ srand48(1);
+ for(cum =0.0,i = 0; i < m; i++)
+ cum += 1.0 / pow(i+1, alpha);
+ cum = 1.0 / cum;
+ for(prob=0.0,i = 0; i < m; i++)
+ zmap[i] = prob += cum / pow(i+1, alpha);
+ qsort(zmap, m, sizeof(zmap[0]), z_cmp);
+
+ for(i = 0; i < n; i++) {
+ double r = drand48();
+ int l = 0, h = m-1;
+ while(l < h) {
+ int k = (l + h) >> 1;
+ if(r > zmap[k]) l = k + 1;
+ else h = k;
+ }
+ a[i] = x1 + l;
+ }
+ free(zmap);
+}
+
+#define OVD (10*MB)
+int main(int argc, char *argv[]) {
+ char fname[0x100], *cmd=NULL;
+ unsigned bp=0,ftype = T_ID, rm=0,rx=30,n=10000000;
+ long long rdmax = 1<<30; tm_t tx=1*1000000;
+ double a = 1.5;
+
+ tminit();
+ VarIntG8IU();
+
+ int c, digit_optind = 0;
+ int this_option_optind = optind ? optind : 1, option_index = 0;
+ static struct option long_options[] = { {"repeat", 0, 0, 'r'}, {0,0, 0, 0} };
+ for(;;) {
+ if((c = getopt_long(argc, argv, "Ac:TBR:ys:r:n:b:c:e:t:r:M:v:m:x:a:", long_options, &option_index)) == -1) break;
+ switch(c) {
+ case 0 : printf("Option %s", long_options[option_index].name); if(optarg) printf (" with arg %s", optarg); printf ("\n"); break;
+ case 'r': reps = atoi(optarg); break;
+ case 'R': trips = atoi(optarg); break;
+ case 'v': verb = atoi(optarg);verb++; break;
+ case 't': tx = atoi(optarg)*1000000; break;
+ case 'c': ftype = atoi(optarg); break;
+ case 'b': rdmax = atoi(optarg)*MB; break;
+ case 'e': cmd=optarg; break;
+ case 'm': rm = atoi(optarg); break;
+ case 'x': rx = atoi(optarg); break; //
+ case 'B': bp++; break;
+ case 'n': n = atoi(optarg); break;
+ case 'a': a = strtod(optarg, NULL); break;
+ default: fprintf(stdout,"unknown option: %c \n", optopt); exit(1);
+ }
+ }
+ int fno,i=0; //libini();
+ if(!bp) { rm = (1<<rm)-1; rx = (1<<rx)-1; if(rx > n) rx = n; } else if(!rm) rm = 1;
+ //printf("range=(%d,%d,%d)\n", rm, rx, n);fflush(stdout);
+ struct libss *ls;
+ if(cmd) {
+ char *q=NULL;
+ for(i=0,libs[0].id = -1;;) {
+ if(cmd) {
+ if(!*cmd) break; //printf("cmd='%s'", cmd);
+ q = strchr(cmd,',');
+ if(q) *q=' ';
+ if(q = strchr(cmd,'/'))
+ *q = '\0';
+ for(ls = libss; ls->id >= 0; ls++)
+ if(!strcasecmp(ls->s, cmd)) {
+ memset(&libs[i], 0, sizeof(struct libs));
+ libs[i].id = ls->id;
+ libs[i].err = 0;
+ libs[i].s = ls->s;
+ libs[i++].v = ls->v;
+ break;
+ }
+ if(ls->id < 0) {
+ printf("library: '%s' not found\n", cmd);
+ exit(-1);
+ }
+ cmd = q?(q+1):"";
+ }
+ }
+ } else for(ls = libss; ls->id >= 0; ls++) {
+ libs[i].id = ls->id;
+ libs[i].err = 0;
+ libs[i].s = ls->s; //printf("%s\n", ls->s);fflush(stdout);
+ libs[i++].v = ls->v;
+ }
+ libs[i].id = -1;
+
+ if(argc <= optind) {
+ unsigned *in, *out, *cpy,*ip; unsigned long long totlen=0;
+ in = malloc(n*4+OVD); if(!in) { printf("malloc err=%u", n); exit(0); }
+ out = malloc(n*4+OVD); if(!out) { printf("malloc err=%u", n); exit(0); }
+ cpy = malloc(n*4+OVD); if(!cpy) { printf("malloc err=%u", n); exit(0); }
+ char s[33]; s[0]=0;
+ if(bp) {
+ int b;
+ printf("bittest\n"); fflush(stdout);
+ for(b = rm; b <= rx; b++) {
+ sprintf(s,"b=%d", b);
+ *in = n;
+ for(i = 1; i <= n; i++)
+ in[i] = (1ull << b)-1;
+ totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, b);
+ print(totlen, s);
+ }
+ } else {
+ printf("zipf a=%3.1f [%u,%u]\n", a, rm, rx);
+ *in = n;
+ zipfgen(in+1, a, rm, rx, n); //stprint();
+ totlen = bench(in, n+1, BLK_SIZE, out, n*4+OVD, s, tx, cpy, -1);
+ print(totlen, s);
+ }
+ free(in);
+ free(cpy);
+ free(out);
+ } else for(fno = optind; fno < argc; fno++) {
+ char *inname = argv[fno];
+ FILE *fi = fopen64(inname, "r");
+ if(!fi) {
+ fprintf(stderr, "open error '%s'", inname); perror(inname);
+ exit(-1);
+ }
+ fseek(fi, 0, SEEK_END);
+ unsigned long long fisize = ftell(fi);
+ fseek(fi, 0, SEEK_SET);
+ if(fisize > rdmax)
+ fisize = rdmax;
+ fisize /= 4; //setvbuf(fi, NULL, _IOFBF, 1000*MB);
+ unsigned *in, *out, *cpy,*ip;
+ unsigned long long totlen=0;
+ int rc;
+ out = malloc(fisize*4+OVD); if(!out) { printf("malloc err=%llu", fisize); exit(0); }
+ cpy = malloc(fisize*4+OVD); if(!cpy) { printf("malloc err=%llu", fisize); exit(0); }
+ in = malloc(fisize*4+1024); if(!in) { printf("malloc err=%llu", fisize); exit(0); } PGM_FD(fileno(fi));
+ int r; fread(&r, 4, 1, fi);
+ while(r > 0) {
+ for(ip = in; ip+r <= in+fisize;) {
+ int rc; PGM_FDPUT(fileno(fi));
+ if((rc = fread(ip+1, 4, r, fi)) <= 0)
+ goto a;
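/* The measurement pattern bench() uses, distilled: run the codec repeatedly
   until at least tx microseconds have elapsed, take the minimum time per
   repetition over `trips` rounds, and convert to the millions-of-integers/s
   figures printed by print(). A self-contained sketch; `codec` is a
   hypothetical stand-in for one beenc/bedec pass over n integers. */
static double measure(void (*codec)(void), unsigned n, tm_t tx, int trips) {
  tm_t best = TM_TMAX; int t;
  for(t = 0; t < trips; t++) {
    tm_t t0 = tminit(), tt; int r = 0;
    do { codec(); r++; } while((tt = tmtime() - t0) < tx);
    if(tt/r < best) best = tt/r;      /* best (minimum) time per repetition */
  }
  return ((double)n/1000000.0) / ((double)best/TM_T); /* as in print() above */
}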
+
+ if(r >= rm && r <= rx) {
+ *ip++ = r;
+ int j;
+ if(verb)
+ printf("%d%s ", r, ftype==T_ID?"I":"N");
+ fflush(stdout);
+ if(ftype == T_ID) {
+ for(j = 0; j < r; ) {
+ unsigned m = j+BLK_SIZE>r?r-j:BLK_SIZE;
+ int i,did,dido = -1;
+ for(i = 0; i < m; i++) {
+ did = ip[i];
+ if(did < dido) {
+ printf("IDs in '%s' not sorted.did=%d,dido=%d ", inname, did, dido);
+ exit(0);
+ }
+ ip[i] = did - dido - 1;
+ dido = /*ip[0]*/did; //printf("%d,", ip[i]); xbits[bsr32(ip[i])]++;
+ }
+ j += m; ip += m; //printf("\r");
+ }
+ } else
+ ip += r;
+ }
+ r = rc = 0;
+ if(ftype == T_ID)
+ rc = fread(&r, 4, 1, fi);
+ if(rc <= 0 || !r)
+ break;
+ }
+ totlen += bench(in, ip-in, BLK_SIZE, out, fisize*4+OVD, inname, tx, cpy, -1);
+ if(totlen > n)
+ break;
+ }
+ a:fclose(fi); //stprint();
+ print(totlen,inname);
+ free(in);
+ free(cpy);
+ free(out);
+ }
+}
+
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..0488c6e
--- /dev/null
+++ b/makefile
@@ -0,0 +1,28 @@
+# powturbo (c) Copyright 2007-2013
+CFLAGS=-ffast-math -fstrict-aliasing -march=native -w -fpermissive
+
+BIT=./
+all: icbench
+
+bitunpack.o: $(BIT)bitunpack.c $(BIT)bitunpack_.h $(BIT)bitunpack.h $(BIT)bitunpack64_.h
+	cc -O2 $(CFLAGS) -c $(BIT)bitunpack.c
+
+bitpack.o: $(BIT)bitpack.c $(BIT)bitpack_.h $(BIT)bitpack.h $(BIT)bitpack64_.h
+	cc -O2 $(CFLAGS) -c $(BIT)bitpack.c
+
+vp4dc.o: $(BIT)vp4dc.c
+	cc -O3 $(CFLAGS) -funroll-loops -c $(BIT)vp4dc.c
+
+SIMDCOMPD=aux/simdcomp/
+SIMDCOMP=$(SIMDCOMPD)bitpacka.o $(SIMDCOMPD)src/simdintegratedbitpacking.o $(SIMDCOMPD)src/simdcomputil.o $(SIMDCOMPD)src/simdbitpacking.o
+
+varintg8iu.o: $(BIT)aux/varintg8iu.c $(BIT)aux/varintg8iu.h
+	cc -O2 $(CFLAGS) -c -funroll-loops -std=c99 $(BIT)aux/varintg8iu.c
+
+icbench: icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o varintg8iu.o vp4dd.o vp4dc.o $(SIMDCOMP)
+	cc -O3 icbench.o bitpack.o bitunpack.o vsimple.o aux/simple8b.o vp4dd.o vp4dc.o varintg8iu.o $(SIMDCOMP) -lm -o icbench $(LFLAGS)
+
+.c.o:
+	cc -O3 $(CFLAGS) $< -c -o $@
+
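/* The T_ID path in icbench above rewrites sorted document IDs as "d-gaps
   minus one": gap[i] = id[i] - id[i-1] - 1 with id[-1] = -1, so strictly
   increasing IDs become small non-negative integers that pack well. A
   round-trip sketch of that transform and its inverse: */
static void dgap_enc(unsigned *id, int n) {   /* in place; IDs strictly increasing */
  int i, prev = -1;
  for(i = 0; i < n; i++) { int cur = id[i]; id[i] = cur - prev - 1; prev = cur; }
}
static void dgap_dec(unsigned *gap, int n) {  /* inverse: gaps back to IDs */
  int i, prev = -1;
  for(i = 0; i < n; i++) { prev += gap[i] + 1; gap[i] = prev; }
}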
diff --git a/vint.h b/vint.h
new file mode 100644
index 0000000..5169b5c
--- /dev/null
+++ b/vint.h
@@ -0,0 +1,70 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ vint.h - "Integer Compression" variable byte
+**/
+
+#ifndef VINT_H
+#define VINT_H
+#include "conf.h"
+//-------------------------------------- variable byte : 32 bits ----------------------------------------------------------------
+ //0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+static unsigned char vtab[]= { 1, 2, 1, 3, 1, 2, 1, 4, 1, 2, 1, 3, 1, 2, 1, 5 };
+#define vbvlen(__x) vtab[(__x)&0xf]
+
+#define vbputa(__op, __x, __act) {\
+ if(likely(__x < (1<< 7))) { *__op++ = __x << 1; __act;}\
+ else if(likely(__x < (1<<14))) { *(unsigned short *)__op = __x << 2 | 0x01; __op += 2; __act;}\
+ else if(likely(__x < (1<<21))) { *(unsigned short *)__op = __x << 3 | 0x03; __op += 2; *__op++ = __x >> 13; __act;}\
+ else if(likely(__x < (1<<28))) { *(unsigned *)__op = __x << 4 | 0x07; __op += 4; __act;}\
+ else { *(unsigned *)__op = __x << 4 | 0x0f; __op += 4; *__op++ = __x >> 28; __act;}\
+}
+
+#define vbgeta(__ip, __x, __act) do { __x = *__ip;\
+ if(!(__x & (1<<0))) { __x >>= 1; __ip++; __act;}\
+ else if(!(__x & (1<<1))) { __x = (*(unsigned short *)__ip) >> 2; __ip += 2; __act;}\
+ else if(!(__x & (1<<2))) { __x = (*(unsigned short *)__ip) >> 3 | *(__ip+2) << 13; __ip += 3; __act;}\
+ else if(!(__x & (1<<3))) { __x = (*(unsigned *)__ip) >> 4; __ip += 4; __act;}\
+ else { __x = (*(unsigned *)__ip) >> 4 | *(__ip+4) << 28; __ip += 5; __act;}\
+} while(0)
+
+#define vblen(_x_) ({ unsigned __x = _x_; __x > 0x7f?(__x > 0x3fff?(__x > 0x1fffff?(__x > 0x0fffffff?5:4):3):2):1; })
+#define vbput(__op, __x) { unsigned _x__ = __x; vbputa(__op, _x__, ;); }
+#define vbget(__ip) ({ unsigned _x_; vbgeta(__ip, _x_, ;); _x_; })
+
+static inline unsigned char *vbenc (unsigned *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned *in_ = in +n; while(in < in_) vbput(out, *in++); return out;}
+static inline unsigned char *vbdec (unsigned char *__restrict__ in, int n, unsigned *__restrict__ out) { unsigned *out_ = out+n,x; while(out < out_) vbgeta(in, x, *out++ = x); return in;}
+
+//--------------------------------------- variable byte : 15 bits -------------------------------------------------------------------
+#define vblen16(__x) ((__x) > 0x7f?2:1)
+#define vbput16(__op, __x) do { unsigned _x = __x; if(likely(_x < 0x80)) *__op++ = _x; else { *__op++ = (_x) >> 8 | 0x80; *__op++ = _x; } } while(0)
+#define vbgeta16(__ip,__x, __act) do { if((__x = *__ip++) > 0x7f) __x = (__x & 0x7f) << 8 | *__ip++; __act; } while(0)
+#define vbget16(__ip) ({ unsigned _x; vbgeta16(__ip, _x, ;); _x; })
+
+static inline unsigned char *vbenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out) { unsigned short *in_ = in +n; while(in < in_) vbput16(out, *in++); return out;}
+static inline unsigned char *vbdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out) { unsigned short *out_ = out+n,x; while(out < out_) vbgeta16(in, x, *out++ = x); return in; }
+
+#endif
+
+
+
diff --git a/vp4dc.c b/vp4dc.c
new file mode 100644
index 0000000..17d323f
--- /dev/null
+++ b/vp4dc.c
@@ -0,0 +1,41 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ vp4dc.c - "Integer Compression" Turbo PforDelta
+**/
+
+#include "conf.h"
+#include "bitpack.h"
+#include "vp4dc.h"
+
+#define PAD8(__x) ( (((__x)+8-1)/8) )
+#include <string.h>
+
+#define USIZE 32
+#include "vp4dc_.h"
+
+#define USIZE 16
+#include "vp4dc_.h"
+
+
+
diff --git a/vp4dc.h b/vp4dc.h
new file mode 100644
index 0000000..e23a94b
--- /dev/null
+++ b/vp4dc.h
@@ -0,0 +1,27 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ vp4dc.h - "Integer Compression" Turbo PforDelta
+**/
+unsigned char *p4denc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out);
+unsigned char *p4denc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out);
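/* Usage sketch for the vint.h variable byte coder defined above: the length
   tag sits in the LOW bits of the first byte (xxxxxxx0 = 1 byte, xxxxxx01 = 2,
   xxxxx011 = 3, xxxx0111 = 4, xxxx1111 = 5), so vbvlen() reads the size of a
   compressed value from its first byte alone. Round trip plus a skip helper: */
static void vbyte_demo(void) {
  unsigned in[4] = { 3, 200, 70000, 1u << 30 }, out[4];
  unsigned char buf[4*5];                /* worst case: 5 bytes per integer */
  unsigned char *e = vbenc(in, 4, buf);  /* e - buf is the compressed size */
  vbdec(buf, 4, out);                    /* out[] now equals in[] */
  (void)e; (void)out;
}
static unsigned char *vbskip(unsigned char *ip, int n) {
  while(n--) ip += vbvlen(*ip);          /* step over n values without decoding */
  return ip;
}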
diff --git a/vp4dc_.h b/vp4dc_.h
new file mode 100644
index 0000000..75fd9f3
--- /dev/null
+++ b/vp4dc_.h
@@ -0,0 +1,62 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ vp4dc_.h - "Integer Compression" Turbo PforDelta
+**/
+#define uint_t TEMPLATE3(uint, USIZE, _t)
+
+unsigned char *TEMPLATE2(p4denc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ out) {
+ int i; unsigned cnt[USIZE+1] = {0}; uint_t b = 0;
+ for(i = 0; i < n; i++) b |= in[i], ++cnt[TEMPLATE2(bsr, USIZE)(in[i])];
+ b = TEMPLATE2(bsr, USIZE)(b);
+
+ unsigned xb=b, ml = PAD8(n*b)+1,x = cnt[b];
+ for(i = b-1; i >= 0; i--) {
+ unsigned l = PAD8(n*i) + (x?(2+16+PAD8(x*(xb-i))):1);
+ if(l < ml) b = i, ml = l;
+ x += cnt[i]; /*if(x >= 64) break;*/
+ }
+ if(xb == b) {
+ *out++ = b << 1;
+ return TEMPLATE2(bitpack, USIZE)(in, n, b, out);
+ }
+ xb-=b;
+ uint_t _in[0x100], inx[0x100]; unsigned miss[0x100];
+ unsigned long long xmap[2]; xmap[0] = xmap[1] = 0;
+ unsigned xn, msk = (1ull<<b)-1;
+ for(xn = i = 0; i < n; i++) {
+ _in[i] = in[i] & msk;
+ miss[xn] = i;
+ xn += in[i] > msk;
+ }
+ for(i = 0; i < xn; i++) {
+ unsigned c = miss[i];
+ inx[i] = in[c] >> b;
+ xmap[c>>6] |= (1ull<<(c&0x3f));
+ }
+ *(unsigned short *)out = xb << 8 | b << 1 | 1; out += 2; out = TEMPLATE2(bitpack, USIZE)(_in, n, b, out);
+ *(unsigned long long *)out = xmap[0]; out += 8;
+ *(unsigned long long *)out = xmap[1]; out += 8;
+ memset(&inx[xn],0,128);
+ return TEMPLATE2(bitpack, USIZE)(inx, xn, xb, out);
+}
+
diff --git a/vp4dd.c b/vp4dd.c
new file mode 100644
index 0000000..2d9e452
--- /dev/null
+++ b/vp4dd.c
@@ -0,0 +1,40 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+ - email : powturbo@gmail.com
+ - github : https://github.com/powturbo
+ - homepage : https://sites.google.com/site/powturbo/
+ - twitter : https://twitter.com/powturbo
+
+ vp4dd.c - "Integer Compression" Turbo PforDelta
+**/
+
+#include "conf.h"
+#include "bitunpack.h"
+#include "vp4dd.h"
+
+#define PAD8(__x) ( (((__x)+8-1)/8) )
+#include <string.h>
+#define USIZE 32
+#include "vp4dd_.h"
+
+//#define USIZE 16
+//#include "vp4dd_.h"
+
+
+
diff --git a/vp4dd.h b/vp4dd.h
new file mode 100644
index 0000000..71af111
--- /dev/null
+++ b/vp4dd.h
@@ -0,0 +1,73 @@
+/**
+ Copyright (C) powturbo 2013-2014
+ GPL v2 License
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
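/* How p4denc above picks the base bit width b: it minimizes the total block
   size in bytes,
     cost(b) = PAD8(n*b) + (x ? 2 + 16 + PAD8(x*(xb - b)) : 1)
   where xb is the full width, x the number of values wider than b bits,
   PAD8(n*b) the bit-packed payload, 2 the header, 16 the 128-bit exception
   bitmap and PAD8(x*(xb-b)) the bit-packed exception remainders. The loop in
   p4denc evaluates this incrementally; an equivalent direct cost function: */
static unsigned p4cost(const unsigned cnt[], unsigned xb, unsigned b, unsigned n) {
  unsigned x = 0, i;                       /* cnt[i]: how many values have width i */
  for(i = b+1; i <= xb; i++) x += cnt[i];  /* exceptions: values wider than b bits */
  return PAD8(n*b) + (x ? 2 + 16 + PAD8(x*(xb - b)) : 1);
}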
+ + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vp4dd.h - "Integer Compression" Turbo PforDelta +**/ +unsigned char *p4ddec32( unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); +unsigned char *p4ddecx32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); + +//----------------------------------------------------------------------- +#define P4D_PAD8(__x) ( (((__x)+8-1)/8) ) +#define P4D_XB(__x) ((__x & 1)?(__x >> 8):0) +#define P4D_B(__x) ((__x >> 1) & 0x3f) +#define P4D_ININC(__in, __x) __in += 1+(__x & 1) + +static inline unsigned vp4dbits(unsigned char *__restrict__ in, int *xb) { unsigned i = *(unsigned short *)in; *xb = P4D_XB(i); return P4D_B(i); } + +struct p4d { + unsigned long long *xmap; + unsigned char *ex; + unsigned i,xb,cum[2]; + int oval,idx; +}; + +static inline void p4dini(struct p4d *p4d, unsigned char **__restrict__ pin, int n, unsigned *b) { unsigned char *in = *pin; + static unsigned long long xmap[2] = { 0 }; + + unsigned i = *(unsigned short *)in; + p4d->i = i; + *b = P4D_B(i); + p4d->xb = P4D_XB(i); + P4D_ININC(in,i); + *pin = in; + + p4d->ex = in + P4D_PAD8(n*(*b)); + p4d->xmap = (i&1)?p4d->ex:xmap; + p4d->ex += (i&1)?16:0; + p4d->cum[0] = 0; + p4d->cum[1] = popcnt64(p4d->xmap[0]); + p4d->oval = p4d->idx = -1; +} + +static ALWAYS_INLINE unsigned vp4dget32(struct p4d p4d, unsigned char *__restrict__ in, unsigned b, unsigned idx) { unsigned bi, cl, u = _bitgetx32(in, b, idx*b); + if(unlikely(p4d.xmap[bi = idx>>6] & (1ull<<(cl = idx & 0x3f)))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<>6] & (1ull<<(cl = idx & 0x3f)))) u |= _bitgetx32(p4d.ex, p4d.xb, (p4d.cum[bi] + popcnt64(p4d.xmap[bi] & ~((~0ull)<oval += vp4dget(*p4d, in, b, ++p4d->idx)+1; while(p4d->oval < val); return p4d->oval; } + diff --git a/vp4dd_.h b/vp4dd_.h new file mode 100644 index 0000000..f92ce5f --- /dev/null +++ b/vp4dd_.h @@ -0,0 +1,369 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
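/* Direct-access sketch (the "TurboPFor DA" rows in the README benchmark):
   p4dini above parses the block header once, then vp4dget32 extracts a single
   entry, OR-ing in the high exception bits when the bitmap marks the slot as
   an exception, with no full block decode. A hypothetical wrapper for one
   128-integer block: */
static unsigned p4d_at(unsigned char *block, unsigned idx) {
  struct p4d p4d; unsigned b;
  unsigned char *in = block;
  p4dini(&p4d, &in, 128, &b);        /* in now points at the packed payload */
  return vp4dget32(p4d, in, b, idx); /* random access to entry idx */
}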
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vp4dd_.h - "Integer Compression" Turbo PforDelta +**/ + #ifdef __AVX2__ +#include + +static ALIGNED(unsigned char, shuffles[256][8], 32) = { + { 0,0,0,0,0,0,0,0 }, + { 0,1,1,1,1,1,1,1 }, + { 1,0,1,1,1,1,1,1 }, + { 0,1,2,2,2,2,2,2 }, + { 1,1,0,1,1,1,1,1 }, + { 0,2,1,2,2,2,2,2 }, + { 2,0,1,2,2,2,2,2 }, + { 0,1,2,3,3,3,3,3 }, + { 1,1,1,0,1,1,1,1 }, + { 0,2,2,1,2,2,2,2 }, + { 2,0,2,1,2,2,2,2 }, + { 0,1,3,2,3,3,3,3 }, + { 2,2,0,1,2,2,2,2 }, + { 0,3,1,2,3,3,3,3 }, + { 3,0,1,2,3,3,3,3 }, + { 0,1,2,3,4,4,4,4 }, + { 1,1,1,1,0,1,1,1 }, + { 0,2,2,2,1,2,2,2 }, + { 2,0,2,2,1,2,2,2 }, + { 0,1,3,3,2,3,3,3 }, + { 2,2,0,2,1,2,2,2 }, + { 0,3,1,3,2,3,3,3 }, + { 3,0,1,3,2,3,3,3 }, + { 0,1,2,4,3,4,4,4 }, + { 2,2,2,0,1,2,2,2 }, + { 0,3,3,1,2,3,3,3 }, + { 3,0,3,1,2,3,3,3 }, + { 0,1,4,2,3,4,4,4 }, + { 3,3,0,1,2,3,3,3 }, + { 0,4,1,2,3,4,4,4 }, + { 4,0,1,2,3,4,4,4 }, + { 0,1,2,3,4,5,5,5 }, + { 1,1,1,1,1,0,1,1 }, + { 0,2,2,2,2,1,2,2 }, + { 2,0,2,2,2,1,2,2 }, + { 0,1,3,3,3,2,3,3 }, + { 2,2,0,2,2,1,2,2 }, + { 0,3,1,3,3,2,3,3 }, + { 3,0,1,3,3,2,3,3 }, + { 0,1,2,4,4,3,4,4 }, + { 2,2,2,0,2,1,2,2 }, + { 0,3,3,1,3,2,3,3 }, + { 3,0,3,1,3,2,3,3 }, + { 0,1,4,2,4,3,4,4 }, + { 3,3,0,1,3,2,3,3 }, + { 0,4,1,2,4,3,4,4 }, + { 4,0,1,2,4,3,4,4 }, + { 0,1,2,3,5,4,5,5 }, + { 2,2,2,2,0,1,2,2 }, + { 0,3,3,3,1,2,3,3 }, + { 3,0,3,3,1,2,3,3 }, + { 0,1,4,4,2,3,4,4 }, + { 3,3,0,3,1,2,3,3 }, + { 0,4,1,4,2,3,4,4 }, + { 4,0,1,4,2,3,4,4 }, + { 0,1,2,5,3,4,5,5 }, + { 3,3,3,0,1,2,3,3 }, + { 0,4,4,1,2,3,4,4 }, + { 4,0,4,1,2,3,4,4 }, + { 0,1,5,2,3,4,5,5 }, + { 4,4,0,1,2,3,4,4 }, + { 0,5,1,2,3,4,5,5 }, + { 5,0,1,2,3,4,5,5 }, + { 0,1,2,3,4,5,6,6 }, + { 1,1,1,1,1,1,0,1 }, + { 0,2,2,2,2,2,1,2 }, + { 2,0,2,2,2,2,1,2 }, + { 0,1,3,3,3,3,2,3 }, + { 2,2,0,2,2,2,1,2 }, + { 0,3,1,3,3,3,2,3 }, + { 3,0,1,3,3,3,2,3 }, + { 0,1,2,4,4,4,3,4 }, + { 2,2,2,0,2,2,1,2 }, + { 0,3,3,1,3,3,2,3 }, + { 3,0,3,1,3,3,2,3 }, + { 0,1,4,2,4,4,3,4 }, + { 3,3,0,1,3,3,2,3 }, + { 0,4,1,2,4,4,3,4 }, + { 4,0,1,2,4,4,3,4 }, + { 0,1,2,3,5,5,4,5 }, + { 2,2,2,2,0,2,1,2 }, + { 0,3,3,3,1,3,2,3 }, + { 3,0,3,3,1,3,2,3 }, + { 0,1,4,4,2,4,3,4 }, + { 3,3,0,3,1,3,2,3 }, + { 0,4,1,4,2,4,3,4 }, + { 4,0,1,4,2,4,3,4 }, + { 0,1,2,5,3,5,4,5 }, + { 3,3,3,0,1,3,2,3 }, + { 0,4,4,1,2,4,3,4 }, + { 4,0,4,1,2,4,3,4 }, + { 0,1,5,2,3,5,4,5 }, + { 4,4,0,1,2,4,3,4 }, + { 0,5,1,2,3,5,4,5 }, + { 5,0,1,2,3,5,4,5 }, + { 0,1,2,3,4,6,5,6 }, + { 2,2,2,2,2,0,1,2 }, + { 0,3,3,3,3,1,2,3 }, + { 3,0,3,3,3,1,2,3 }, + { 0,1,4,4,4,2,3,4 }, + { 3,3,0,3,3,1,2,3 }, + { 0,4,1,4,4,2,3,4 }, + { 4,0,1,4,4,2,3,4 }, + { 0,1,2,5,5,3,4,5 }, + { 3,3,3,0,3,1,2,3 }, + { 0,4,4,1,4,2,3,4 }, + { 4,0,4,1,4,2,3,4 }, + { 0,1,5,2,5,3,4,5 }, + { 4,4,0,1,4,2,3,4 }, + { 0,5,1,2,5,3,4,5 }, + { 5,0,1,2,5,3,4,5 }, + { 0,1,2,3,6,4,5,6 }, + { 3,3,3,3,0,1,2,3 }, + { 0,4,4,4,1,2,3,4 }, + { 4,0,4,4,1,2,3,4 }, + { 0,1,5,5,2,3,4,5 }, + { 4,4,0,4,1,2,3,4 }, + { 0,5,1,5,2,3,4,5 }, + { 5,0,1,5,2,3,4,5 }, + { 0,1,2,6,3,4,5,6 }, + { 4,4,4,0,1,2,3,4 }, + { 0,5,5,1,2,3,4,5 }, + { 5,0,5,1,2,3,4,5 }, + { 0,1,6,2,3,4,5,6 }, + { 5,5,0,1,2,3,4,5 }, + { 0,6,1,2,3,4,5,6 }, + { 6,0,1,2,3,4,5,6 }, + { 0,1,2,3,4,5,6,7 }, + { 1,1,1,1,1,1,1,0 }, + { 0,2,2,2,2,2,2,1 }, + { 2,0,2,2,2,2,2,1 }, + { 0,1,3,3,3,3,3,2 }, + { 2,2,0,2,2,2,2,1 }, + { 0,3,1,3,3,3,3,2 }, + { 3,0,1,3,3,3,3,2 }, + { 0,1,2,4,4,4,4,3 }, + { 2,2,2,0,2,2,2,1 }, + { 0,3,3,1,3,3,3,2 }, + { 3,0,3,1,3,3,3,2 }, + { 0,1,4,2,4,4,4,3 }, + { 
3,3,0,1,3,3,3,2 }, + { 0,4,1,2,4,4,4,3 }, + { 4,0,1,2,4,4,4,3 }, + { 0,1,2,3,5,5,5,4 }, + { 2,2,2,2,0,2,2,1 }, + { 0,3,3,3,1,3,3,2 }, + { 3,0,3,3,1,3,3,2 }, + { 0,1,4,4,2,4,4,3 }, + { 3,3,0,3,1,3,3,2 }, + { 0,4,1,4,2,4,4,3 }, + { 4,0,1,4,2,4,4,3 }, + { 0,1,2,5,3,5,5,4 }, + { 3,3,3,0,1,3,3,2 }, + { 0,4,4,1,2,4,4,3 }, + { 4,0,4,1,2,4,4,3 }, + { 0,1,5,2,3,5,5,4 }, + { 4,4,0,1,2,4,4,3 }, + { 0,5,1,2,3,5,5,4 }, + { 5,0,1,2,3,5,5,4 }, + { 0,1,2,3,4,6,6,5 }, + { 2,2,2,2,2,0,2,1 }, + { 0,3,3,3,3,1,3,2 }, + { 3,0,3,3,3,1,3,2 }, + { 0,1,4,4,4,2,4,3 }, + { 3,3,0,3,3,1,3,2 }, + { 0,4,1,4,4,2,4,3 }, + { 4,0,1,4,4,2,4,3 }, + { 0,1,2,5,5,3,5,4 }, + { 3,3,3,0,3,1,3,2 }, + { 0,4,4,1,4,2,4,3 }, + { 4,0,4,1,4,2,4,3 }, + { 0,1,5,2,5,3,5,4 }, + { 4,4,0,1,4,2,4,3 }, + { 0,5,1,2,5,3,5,4 }, + { 5,0,1,2,5,3,5,4 }, + { 0,1,2,3,6,4,6,5 }, + { 3,3,3,3,0,1,3,2 }, + { 0,4,4,4,1,2,4,3 }, + { 4,0,4,4,1,2,4,3 }, + { 0,1,5,5,2,3,5,4 }, + { 4,4,0,4,1,2,4,3 }, + { 0,5,1,5,2,3,5,4 }, + { 5,0,1,5,2,3,5,4 }, + { 0,1,2,6,3,4,6,5 }, + { 4,4,4,0,1,2,4,3 }, + { 0,5,5,1,2,3,5,4 }, + { 5,0,5,1,2,3,5,4 }, + { 0,1,6,2,3,4,6,5 }, + { 5,5,0,1,2,3,5,4 }, + { 0,6,1,2,3,4,6,5 }, + { 6,0,1,2,3,4,6,5 }, + { 0,1,2,3,4,5,7,6 }, + { 2,2,2,2,2,2,0,1 }, + { 0,3,3,3,3,3,1,2 }, + { 3,0,3,3,3,3,1,2 }, + { 0,1,4,4,4,4,2,3 }, + { 3,3,0,3,3,3,1,2 }, + { 0,4,1,4,4,4,2,3 }, + { 4,0,1,4,4,4,2,3 }, + { 0,1,2,5,5,5,3,4 }, + { 3,3,3,0,3,3,1,2 }, + { 0,4,4,1,4,4,2,3 }, + { 4,0,4,1,4,4,2,3 }, + { 0,1,5,2,5,5,3,4 }, + { 4,4,0,1,4,4,2,3 }, + { 0,5,1,2,5,5,3,4 }, + { 5,0,1,2,5,5,3,4 }, + { 0,1,2,3,6,6,4,5 }, + { 3,3,3,3,0,3,1,2 }, + { 0,4,4,4,1,4,2,3 }, + { 4,0,4,4,1,4,2,3 }, + { 0,1,5,5,2,5,3,4 }, + { 4,4,0,4,1,4,2,3 }, + { 0,5,1,5,2,5,3,4 }, + { 5,0,1,5,2,5,3,4 }, + { 0,1,2,6,3,6,4,5 }, + { 4,4,4,0,1,4,2,3 }, + { 0,5,5,1,2,5,3,4 }, + { 5,0,5,1,2,5,3,4 }, + { 0,1,6,2,3,6,4,5 }, + { 5,5,0,1,2,5,3,4 }, + { 0,6,1,2,3,6,4,5 }, + { 6,0,1,2,3,6,4,5 }, + { 0,1,2,3,4,7,5,6 }, + { 3,3,3,3,3,0,1,2 }, + { 0,4,4,4,4,1,2,3 }, + { 4,0,4,4,4,1,2,3 }, + { 0,1,5,5,5,2,3,4 }, + { 4,4,0,4,4,1,2,3 }, + { 0,5,1,5,5,2,3,4 }, + { 5,0,1,5,5,2,3,4 }, + { 0,1,2,6,6,3,4,5 }, + { 4,4,4,0,4,1,2,3 }, + { 0,5,5,1,5,2,3,4 }, + { 5,0,5,1,5,2,3,4 }, + { 0,1,6,2,6,3,4,5 }, + { 5,5,0,1,5,2,3,4 }, + { 0,6,1,2,6,3,4,5 }, + { 6,0,1,2,6,3,4,5 }, + { 0,1,2,3,7,4,5,6 }, + { 4,4,4,4,0,1,2,3 }, + { 0,5,5,5,1,2,3,4 }, + { 5,0,5,5,1,2,3,4 }, + { 0,1,6,6,2,3,4,5 }, + { 5,5,0,5,1,2,3,4 }, + { 0,6,1,6,2,3,4,5 }, + { 6,0,1,6,2,3,4,5 }, + { 0,1,2,7,3,4,5,6 }, + { 5,5,5,0,1,2,3,4 }, + { 0,6,6,1,2,3,4,5 }, + { 6,0,6,1,2,3,4,5 }, + { 0,1,7,2,3,4,5,6 }, + { 6,6,0,1,2,3,4,5 }, + { 0,7,1,2,3,4,5,6 }, + { 7,0,1,2,3,4,5,6 }, + { 0,1,2,3,4,5,6,7, } + }; + + #elif defined(__SSE4_1__) +#include + #endif + +#define uint_t TEMPLATE3(uint, USIZE, _t) + +unsigned char *TEMPLATE2(p4ddec, USIZE)(unsigned char *__restrict__ in, int n, uint_t *__restrict__ out) { + uint_t ex[0x100+8]; unsigned i = *(unsigned short *)in; uint_t b = P4D_B(i); unsigned xb = P4D_XB(i); + P4D_ININC(in,i); + in = TEMPLATE2(bitunpack, USIZE)(in, n, b, out); + if(i & 1) { + unsigned long long b0 = *(unsigned long long *)in; in += 8; unsigned long long b1 = *(unsigned long long *)in; in += 8; + in = TEMPLATE2(bitunpack, USIZE)(in, popcnt64(b0) + popcnt64(b1), xb, ex); + #ifdef __AVX2__ + unsigned *op,*pex = ex; + for(op = out; b0; b0 >>= 8,op += 8) { const unsigned m = (unsigned char)b0, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), 
_mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; + } + for(op = out+64; b1; b1 >>= 8,op += 8) { const unsigned m = (unsigned char)b1, mc=popcnt32(m), s = pex[mc]; pex[mc]=0; + _mm256_storeu_si256((__m256i *)op, _mm256_add_epi32(_mm256_loadu_si256((const __m256i*)op), _mm256_permutevar8x32_epi32(_mm256_slli_epi32(_mm256_load_si256((const __m256i*)pex), b), _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)shuffles[m])) )) ); pex += mc; *pex=s; + } + #elif defined(__SSE4_1__) + + static ALIGNED(char, shuffles[16][16], 16) = { + #define _ 0x80 + { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, + { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, + { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, + { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, + { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, + { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, + { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, + { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, + { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, + #undef _ + }; + unsigned *op,*pex = ex; + for(op = out; b0; b0 >>= 4,op+=4) { const unsigned m = b0&0xf; + _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); + } + for(op=out+64; b1; b1 >>= 4,op+=4) { const unsigned m = b1&0xf; + _mm_storeu_si128((__m128i *)op, _mm_add_epi32(_mm_loadu_si128((__m128i*)op), _mm_shuffle_epi8(_mm_slli_epi32(_mm_load_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m); + } + #else + unsigned k = 0; + while(b0) { unsigned x = ctzll(b0); out[x] += ex[k++]<i&1)?(p4d->xmap+2):p4d->in+ PAD8(n*xb); +} + #endif diff --git a/vsimple.c b/vsimple.c new file mode 100644 index 0000000..f8bff77 --- /dev/null +++ b/vsimple.c @@ -0,0 +1,42 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
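/* The exception merge done with AVX2/SSE shuffles above, in scalar form:
   b0/b1 are the two 64-bit exception bitmaps of a 128-integer block and ex[]
   holds the unpacked high parts in index order; every set bit k patches the
   top bits of out[k] in above the b low bits. A sketch equivalent to the
   non-SIMD branch of p4ddec: */
static void p4d_patch(unsigned *out, const unsigned *ex,
                      unsigned long long b0, unsigned long long b1, unsigned b) {
  unsigned k = 0;
  while(b0) { unsigned x = ctzll(b0); out[x]      += ex[k++] << b; b0 &= b0 - 1; }
  while(b1) { unsigned x = ctzll(b1); out[64 + x] += ex[k++] << b; b1 &= b1 - 1; }
}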
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple.c - "Integer Compression" variable simple +**/ + +#include "vsimple.h" + +#define USE_RLE + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 +#define SV_LIM unsigned char s_lim[] = { 0, 28, 28, 28, 28, 36, 36, 36, 36, 36, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 0 }; +#define SV_ITM unsigned s_itm[] = { -1, 28, 14, 9, 7, 7, 6, 5, 4, 4, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, -1 } +static SV_ITM; +static SV_LIM; + +#include +#define USIZE 32 +#include "vsimple_.h" + +#define USIZE 16 +#include "vsimple_.h" + diff --git a/vsimple.h b/vsimple.h new file mode 100644 index 0000000..b1684f4 --- /dev/null +++ b/vsimple.h @@ -0,0 +1,42 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple.h - "Integer Compression" variable simple +**/ + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned char *vsenc32(unsigned *__restrict__ in, int n, unsigned char *__restrict__ out); +unsigned char *vsdec32(unsigned char *__restrict__ in, int n, unsigned *__restrict__ out); + +unsigned char *vsenc16(unsigned short *__restrict__ in, int n, unsigned char *__restrict__ out); +unsigned char *vsdec16(unsigned char *__restrict__ in, int n, unsigned short *__restrict__ out); + +#ifdef __cplusplus +} +#endif + + + diff --git a/vsimple_.h b/vsimple_.h new file mode 100644 index 0000000..59f1dbe --- /dev/null +++ b/vsimple_.h @@ -0,0 +1,396 @@ +/** + Copyright (C) powturbo 2013-2014 + GPL v2 License + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
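/* The "variable simple" word layout used by vsenc/vsdec below: the low 4 bits
   of each code word are a selector, the payload starts at bit 4. Selector 2,
   for instance, packs 14 two-bit values into one 32-bit word; selector 13
   packs two 30-bit values into 64 bits; selectors 0 and 8 are the RLE escapes.
   A sketch of the selector-2 decode, i.e. the loop form of the unrolled case
   in vsdec below: */
static unsigned *sv_dec2(unsigned w, unsigned *op) {
  int i;
  for(i = 0; i < 14; i++) *op++ = (w >> (4 + 2*i)) & 3; /* 14 x 2 bits */
  return op;
}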
+ + - email : powturbo@gmail.com + - github : https://github.com/powturbo + - homepage : https://sites.google.com/site/powturbo/ + - twitter : https://twitter.com/powturbo + + vsimple_.h - "Integer Compression" variable simple +**/ + +#include "vint.h" +#define uint_t TEMPLATE3(uint, USIZE, _t) + +unsigned char *TEMPLATE2(vsenc, USIZE)(uint_t *__restrict__ in, int n, unsigned char *__restrict__ op) { + unsigned xm,m,r; + uint_t *e = in+n,*ip; + for(ip = in; ip < e; ) { + #ifdef USE_RLE + if(ip < e-4 && *ip == *(ip+1)) { uint_t *q = ip+1; while(q < e-1 && *(q+1) == *ip) q++; r = q - ip; + if(r*TEMPLATE2(bsr, USIZE)(*ip) > 16 || !*ip && r>4) { m = (*ip)?33:0; goto a; } + } else + #endif + r = 0; unsigned x = m = bsr32(*ip); + while((r+1)*(xm = x > m?x:m) <= s_lim[xm]) { m = xm; x = TEMPLATE2(bsr, USIZE)(*(ip+(++r))); } + if(/*xm != 32 &&*/ m) while(r < s_itm[m]) m++; + a:; + switch(m) { + case 0: ip += r; + if(--r >= 0xf) { + *op++ = 0xf0; + if(n <= 0x100) + *op++ = r; + else + vbput(op, r); + } else *op++ = r<<4; + break; + case 1: + *(unsigned *)op = 1 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 5 | + (unsigned)ip[ 2] << 6 | + (unsigned)ip[ 3] << 7 | + (unsigned)ip[ 4] << 8 | + (unsigned)ip[ 5] << 9 | + (unsigned)ip[ 6] << 10 | + (unsigned)ip[ 7] << 11 | + (unsigned)ip[ 8] << 12 | + (unsigned)ip[ 9] << 13 | + (unsigned)ip[10] << 14 | + (unsigned)ip[11] << 15 | + (unsigned)ip[12] << 16 | + (unsigned)ip[13] << 17 | + (unsigned)ip[14] << 18 | + (unsigned)ip[15] << 19 | + (unsigned)ip[16] << 20 | + (unsigned)ip[17] << 21 | + (unsigned)ip[18] << 22 | + (unsigned)ip[19] << 23 | + (unsigned)ip[20] << 24 | + (unsigned)ip[21] << 25 | + (unsigned)ip[22] << 26 | + (unsigned)ip[23] << 27 | + (unsigned)ip[24] << 28 | + (unsigned)ip[25] << 29 | + (unsigned)ip[26] << 30 | + (unsigned)ip[27] << 31; ip += 28; op += 4; + break; + case 2: + *(unsigned *)op = 2 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 6 | + (unsigned)ip[ 2] << 8 | + (unsigned)ip[ 3] << 10 | + (unsigned)ip[ 4] << 12 | + (unsigned)ip[ 5] << 14 | + (unsigned)ip[ 6] << 16 | + (unsigned)ip[ 7] << 18 | + (unsigned)ip[ 8] << 20 | + (unsigned)ip[ 9] << 22 | + (unsigned)ip[10] << 24 | + (unsigned)ip[11] << 26 | + (unsigned)ip[12] << 28 | + (unsigned)ip[13] << 30; ip += 14; op += 4; + break; + case 3: + *(unsigned *)op = 3 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 7 | + (unsigned)ip[ 2] << 10 | + (unsigned)ip[ 3] << 13 | + (unsigned)ip[ 4] << 16 | + (unsigned)ip[ 5] << 19 | + (unsigned)ip[ 6] << 22 | + (unsigned)ip[ 7] << 25 | + (unsigned)ip[ 8] << 28; ip += 9; op += 4; + break; + case 4: + *(uint64_t *)op = 4 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 8 | + (unsigned)ip[ 2] << 12 | + (unsigned)ip[ 3] << 16 | + (unsigned)ip[ 4] << 20 | + (unsigned)ip[ 5] << 24 | + (unsigned)ip[ 6] << 28; ip += 7; op += 4; + break; + case 5: + *(uint64_t *)op = 5 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 9 | + (unsigned)ip[ 2] << 14 | + (unsigned)ip[ 3] << 19 | + (unsigned)ip[ 4] << 24 | + (uint64_t)ip[ 5] << 29 | + (uint64_t)ip[ 6] << 34; ip += 7; op += 5; + break; + case 6: + *(uint64_t *)op = 6 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 10 | + (unsigned)ip[ 2] << 16 | + (unsigned)ip[ 3] << 22 | + (uint64_t)ip[ 4] << 28 | + (uint64_t)ip[ 5] << 34; ip += 6; op += 5; + break; + case 7: + *(uint64_t *)op = 7 | + (unsigned)ip[ 0] << 4 | + (unsigned)ip[ 1] << 11 | + (unsigned)ip[ 2] << 18 | + (uint64_t)ip[ 3] << 25 | + (uint64_t)ip[ 4] << 32; ip += 5; op += 5; + break; + case 8: + case 9: + *(uint64_t *)op = 9 | + (unsigned)ip[ 
0] << 4 |
+ (unsigned)ip[ 1] << 13 |
+ (unsigned)ip[ 2] << 22 |
+ (uint64_t)ip[ 3] << 31; ip += 4; op += 5;
+ break;
+ case 10:
+ *(uint64_t *)op = 10 |
+ (unsigned)ip[ 0] << 4 |
+ (unsigned)ip[ 1] << 14 |
+ (uint64_t)ip[ 2] << 24 |
+ (uint64_t)ip[ 3] << 34 |
+ (uint64_t)ip[ 4] << 44 |
+ (uint64_t)ip[ 5] << 54; ip += 6; op += 8;
+ break;
+
+ case 11:
+ case 12:
+ *(uint64_t *)op = 12 |
+ (unsigned)ip[ 0] << 4 |
+ (unsigned)ip[ 1] << 16 |
+ (uint64_t)ip[ 2] << 28 |
+ (uint64_t)ip[ 3] << 40 |
+ (uint64_t)ip[ 4] << 52; ip += 5; op += 8;
+ break;
+ case 13:
+ case 14:
+ case 15:
+ *(uint64_t *)op = 15 |
+ (unsigned)ip[ 0] << 4 |
+ (uint64_t)ip[ 1] << 19 |
+ (uint64_t)ip[ 2] << 34 |
+ (uint64_t)ip[ 3] << 49; ip += 4; op += 8;
+ break;
+ case 16:
+ case 17:
+ case 18:
+ case 19:
+ case 20:
+ *(uint64_t *)op = 11 |
+ (unsigned)ip[ 0] << 4 |
+ (uint64_t)ip[ 1] << 24 |
+ (uint64_t)ip[ 2] << 44; ip += 3; op += 8;
+ break;
+ case 21:
+ case 22:
+ case 23:
+ case 24:
+ case 25:
+ case 26:
+ case 27:
+ case 28:
+ case 29:
+ case 30:
+ *(uint64_t *)op = 13 |
+ (unsigned)ip[ 0] << 4 |
+ (uint64_t)ip[ 1] << 34; ip += 2; op += 8;
+ break;
+ case 31:
+ case 32:
+ *(uint64_t *)op = 14 |
+ (uint64_t)ip[ 0] << 4; ip++; op += 5;
+ break;
+ #ifdef USE_RLE
+ case 33: ip += r;
+ if(--r >= 0xf) {
+ *op++ = 0xf0|8;
+ if(n <= 0x100)
+ *op++ = r;
+ else
+ vbput(op, r);
+ } else *op++ = r<<4|8;
+ vbput(op, ip[0]);
+ break;
+ #endif
+ }
+ }
+ return op;
+}
+
+#define OP(__x) op[__x] // *op++ //
+#define OPI(__x) op+=__x// //
+
+unsigned char *TEMPLATE2(vsdec, USIZE)(unsigned char *__restrict__ ip, int n, uint_t *__restrict__ op) { uint_t *op_=op+n;
+ while(op < op_) { register uint64_t w=*(uint64_t *)ip;
+ switch(w & 15) {
+ case 0: {
+ int r = (w>>4)&0xf; ip++;
+ if(unlikely(r == 0xf)) {
+ if(n <= 0x100)
+ r = (w>>8)&0xff, ip++;
+ else
+ r = vbget(ip);
+ }
+ uint_t *q = op; op += r+1; while(q < op) *q++ = 0;
+ } break;
+ case 1:
+ OP( 0) = (w >> 4) & 1;
+ OP( 1) = (w >> 5) & 1;
+ OP( 2) = (w >> 6) & 1;
+ OP( 3) = (w >> 7) & 1;
+ OP( 4) = (w >> 8) & 1;
+ OP( 5) = (w >> 9) & 1;
+ OP( 6) = (w >> 10) & 1;
+ OP( 7) = (w >> 11) & 1;
+ OP( 8) = (w >> 12) & 1;
+ OP( 9) = (w >> 13) & 1;
+ OP(10) = (w >> 14) & 1;
+ OP(11) = (w >> 15) & 1;
+ OP(12) = (w >> 16) & 1;
+ OP(13) = (w >> 17) & 1;
+ OP(14) = (w >> 18) & 1;
+ OP(15) = (w >> 19) & 1;
+ OP(16) = (w >> 20) & 1;
+ OP(17) = (w >> 21) & 1;
+ OP(18) = (w >> 22) & 1;
+ OP(19) = (w >> 23) & 1;
+ OP(20) = (w >> 24) & 1;
+ OP(21) = (w >> 25) & 1;
+ OP(22) = (w >> 26) & 1;
+ OP(23) = (w >> 27) & 1;
+ OP(24) = (w >> 28) & 1;
+ OP(25) = (w >> 29) & 1;
+ OP(26) = (w >> 30) & 1;
+ OP(27) = (w >> 31) & 1; OPI( 28); ip+=4;
+ break;
+ case 2:
+ OP( 0) = (w >> 4) & 3;
+ OP( 1) = (w >> 6) & 3;
+ OP( 2) = (w >> 8) & 3;
+ OP( 3) = (w >> 10) & 3;
+ OP( 4) = (w >> 12) & 3;
+ OP( 5) = (w >> 14) & 3;
+ OP( 6) = (w >> 16) & 3;
+ OP( 7) = (w >> 18) & 3;
+ OP( 8) = (w >> 20) & 3;
+ OP( 9) = (w >> 22) & 3;
+ OP(10) = (w >> 24) & 3;
+ OP(11) = (w >> 26) & 3;
+ OP(12) = (w >> 28) & 3;
+ OP(13) = (w >> 30) & 3; OPI( 14); ip+=4;
+ break;
+ case 3:
+ OP( 0) = (w >> 4) & 7;
+ OP( 1) = (w >> 7) & 7;
+ OP( 2) = (w >> 10) & 7;
+ OP( 3) = (w >> 13) & 7;
+ OP( 4) = (w >> 16) & 7;
+ OP( 5) = (w >> 19) & 7;
+ OP( 6) = (w >> 22) & 7;
+ OP( 7) = (w >> 25) & 7;
+ OP( 8) = (w >> 28) & 7; OPI( 9); ip+=4;
+ break;
+ case 4:
+ OP( 0) = (w >> 4) & 0xf;
+ OP( 1) = (w >> 8) & 0xf;
+ OP( 2) = (w >> 12) & 0xf;
+ OP( 3) = (w >> 16) & 0xf;
+ OP( 4) = (w >> 20) & 0xf;
+ OP( 5) = (w >> 24) & 0xf;
+ OP( 6) = (w >> 28) & 0xf; OPI( 7); ip+=4;
+ break;
+ case 5:
+ OP( 0) = (w >> 4) & 0x1f;
+ OP( 1) = (w >> 9) & 0x1f;
+ OP( 2) = (w >> 14) & 0x1f;
+ OP( 3) = (w >> 19) & 0x1f;
+ OP( 4) = (w >> 24) & 0x1f;
+ OP( 5) = (w >> 29) & 0x1f;
+ OP( 6) = (w >> 34) & 0x1f; OPI( 7); ip+=5;
+ break;
+ case 6:
+ OP(0) = (w >> 4) & 0x3f;
+ OP(1) = (w >> 10) & 0x3f;
+ OP(2) = (w >> 16) & 0x3f;
+ OP(3) = (w >> 22) & 0x3f;
+ OP(4) = (w >> 28) & 0x3f;
+ OP(5) = (w >> 34) & 0x3f; OPI( 6); ip+=5;
+ break;
+
+ case 7:
+ OP(0) = (w >> 4) & 0x7f;
+ OP(1) = (w >> 11) & 0x7f;
+ OP(2) = (w >> 18) & 0x7f;
+ OP(3) = (w >> 25) & 0x7f;
+ OP(4) = (w >> 32) & 0x7f; OPI( 5); ip+=5;
+ break;
+
+ #ifdef USE_RLE
+ case 8: {
+ int r = (w>>4)&0xf; ip++;
+ if(unlikely(r == 0xf)) {
+ if(n <= 0x100)
+ r = (w>>8)&0xff, ip++;
+ else
+ r = vbget(ip);
+ }
+ unsigned u = vbget(ip); uint_t *q = op; op += r+1; while(q < op) *q++ = u;
+ } break;
+ #endif
+ case 9:
+ OP(0) = (w >> 4) & 0x1ff;
+ OP(1) = (w >> 13) & 0x1ff;
+ OP(2) = (w >> 22) & 0x1ff;
+ OP(3) = (w >> 31) & 0x1ff; OPI( 4); ip+=5;
+ break;
+
+ case 10:
+ OP(0) = (w >> 4) & 0x3ff;
+ OP(1) = (w >> 14) & 0x3ff;
+ OP(2) = (w >> 24) & 0x3ff;
+ OP(3) = (w >> 34) & 0x3ff;
+ OP(4) = (w >> 44) & 0x3ff;
+ OP(5) = (w >> 54) & 0x3ff; OPI( 6); ip+=8;
+ break;
+ case 12:
+ OP(0) = (w >> 4) & 0xfff;
+ OP(1) = (w >> 16) & 0xfff;
+ OP(2) = (w >> 28) & 0xfff;
+ OP(3) = (w >> 40) & 0xfff;
+ OP(4) = (w >> 52) & 0xfff; OPI( 5); ip+=8;
+ break;
+ case 15:
+ OP(0) = (w >> 4) & 0x7fff;
+ OP(1) = (w >> 19) & 0x7fff;
+ OP(2) = (w >> 34) & 0x7fff;
+ OP(3) = (w >> 49) & 0x7fff; OPI( 4); ip+=8;
+ break;
+ case 11:
+ OP(0) = (w >> 4) & 0xfffff; // 20
+ OP(1) = (w >> 24) & 0xfffff;
+ OP(2) = (w >> 44) & 0xfffff; OPI( 3); ip+=8;
+ break;
+ case 13:
+ OP(0) = (w >> 4) & ((1<<30)-1);
+ OP(1) = (w >> 34) & ((1<<30)-1); OPI( 2); ip+=8;
+ break;
+ case 14:
+ OP(0) = (w >> 4) & ((1ull<<32)-1); OPI( 1); ip+=5;
+ break;
+ }
+ }
+ return ip;
+}
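/* The USE_RLE escape codes in vsenc/vsdec above: selector 0 encodes a run of
   zeros, selector 8 a run of one repeated, variable-byte coded value. The run
   length minus one lives in bits 4..7 and escapes to a following byte (or a
   variable byte when n > 256) for longer runs. A sketch of the header
   round trip, assuming n <= 256: */
static unsigned char *rle_hdr_put(unsigned char *op, unsigned run, int sel) { /* run >= 1, sel = 0 or 8 */
  unsigned r = run - 1;
  if(r >= 0xf) { *op++ = 0xf0 | sel; *op++ = r; }  /* escaped run length byte */
  else *op++ = r << 4 | sel;
  return op;
}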