diff --git a/bitunpackv32_.h b/bitunpackv32_.h new file mode 100644 index 0000000..caf5762 --- /dev/null +++ b/bitunpackv32_.h @@ -0,0 +1,1026 @@ +/*#define BITUNBLKV32_0(ip, i, op, parm) {\ + OPU(op,i*0+ 0, 0, parm);\ + OPU(op,i*0+ 1, 0, parm);\ + OPU(op,i*0+ 2, 0, parm);\ + OPU(op,i*0+ 3, 0, parm);\ + OPU(op,i*0+ 4, 0, parm);\ + OPU(op,i*0+ 5, 0, parm);\ + OPU(op,i*0+ 6, 0, parm);\ + OPU(op,i*0+ 7, 0, parm);\ + OPU(op,i*0+ 8, 0, parm);\ + OPU(op,i*0+ 9, 0, parm);\ + OPU(op,i*0+10, 0, parm);\ + OPU(op,i*0+11, 0, parm);\ + OPU(op,i*0+12, 0, parm);\ + OPU(op,i*0+13, 0, parm);\ + OPU(op,i*0+14, 0, parm);\ + OPU(op,i*0+15, 0, parm);\ + OPU(op,i*0+16, 0, parm);\ + OPU(op,i*0+17, 0, parm);\ + OPU(op,i*0+18, 0, parm);\ + OPU(op,i*0+19, 0, parm);\ + OPU(op,i*0+20, 0, parm);\ + OPU(op,i*0+21, 0, parm);\ + OPU(op,i*0+22, 0, parm);\ + OPU(op,i*0+23, 0, parm);\ + OPU(op,i*0+24, 0, parm);\ + OPU(op,i*0+25, 0, parm);\ + OPU(op,i*0+26, 0, parm);\ + OPU(op,i*0+27, 0, parm);\ + OPU(op,i*0+28, 0, parm);\ + OPU(op,i*0+29, 0, parm);\ + OPU(op,i*0+30, 0, parm);\ + OPU(op,i*0+31, 0, parm);;\ +}*/ +/* BITUNPACKV32_0: forwards to BITUNBLKV32_0 with block index 0. NOTE(review): the only BITUNBLKV32_0 visible in this patch is the commented-out text above, so a live definition has to be supplied elsewhere before this macro is expanded. */ +#define BITUNPACKV32_0(ip, op, parm) {\ + BITUNBLKV32_0(ip, 0, op, parm);\ +} +/* BITUNBLKV32_1: loads one 128-bit vector through ip (post-incremented) and issues 32 VSTO calls; call k passes ov = every 32-bit lane shifted right by k and ANDed with mv (the last call skips the AND because a logical shift by 31 already leaves a single bit). mv is not a macro parameter and must be in scope where this expands - presumably the 1-bit lane mask, TODO confirm. VSTO is defined elsewhere and is given (op, i*32+k, ov, parm). */ +#define BITUNBLKV32_1(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); 
VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 25), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 27), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 29), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 30), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_1: a single BITUNBLKV32_1 block with i = 0. */ +#define BITUNPACKV32_1(ip, op, parm) {\ + BITUNBLKV32_1(ip, 0, op, parm);\ +} +/* BITUNBLKV32_2: one 128-bit load, 16 VSTO calls; call k uses lanes shifted right by 2*k and ANDed with mv (last call: shift by 30, no AND). Second VSTO argument is i*16+k. mv must be in scope at the expansion site - presumably the 2-bit mask, TODO confirm. */ +#define BITUNBLKV32_2(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26), mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28), mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_2: two BITUNBLKV32_2 blocks (i = 0, 1), i.e. 32 VSTO calls fed by two loads. */ +#define BITUNPACKV32_2(ip, op, parm) {\ + BITUNBLKV32_2(ip, 0, op, parm);\ + BITUNBLKV32_2(ip, 1, op, parm);\ +} +/* BITUNBLKV32_3: 32 VSTO calls fed from 3 consecutive 128-bit loads (ip is post-incremented 3 times), lane shifts advancing by 3 per call. Where a field crosses a word boundary (shifts 30 and 31) the result is (old word >> s) OR ((new word << (32-s)) AND mv); the final call is a bare shift by 29. Second VSTO argument is i*32+k; mv comes from the enclosing scope - presumably the 3-bit mask, TODO confirm. */ +#define BITUNBLKV32_3(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 
24), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 27), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 25), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 28), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_3: a single BITUNBLKV32_3 block with i = 0. */ +#define 
BITUNPACKV32_3(ip, op, parm) {\ + BITUNBLKV32_3(ip, 0, op, parm);\ +} +/* BITUNBLKV32_4: one 128-bit load, 8 VSTO calls with lane shifts 0,4,...,28 (AND with mv on all but the last). Second VSTO argument is i*8+k. */ +#define BITUNBLKV32_4(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); VSTO(op,i*8+ 7,ov,parm); ;\ +} +/* BITUNPACKV32_4: four BITUNBLKV32_4 blocks (i = 0..3). */ +#define BITUNPACKV32_4(ip, op, parm) {\ + BITUNBLKV32_4(ip, 0, op, parm);\ + BITUNBLKV32_4(ip, 1, op, parm);\ + BITUNBLKV32_4(ip, 2, op, parm);\ + BITUNBLKV32_4(ip, 3, op, parm);\ +} +/* BITUNBLKV32_5: 32 VSTO calls from 5 consecutive 128-bit loads, shifts advancing by 5; fields crossing a word boundary are stitched as (old >> s) OR ((new << (32-s)) AND mv). Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_5(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 25), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23), mv); 
VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 26), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_5: a single BITUNBLKV32_5 block with i = 0. */ +#define BITUNPACKV32_5(ip, op, parm) {\ + BITUNBLKV32_5(ip, 0, op, parm);\ +} +/* BITUNBLKV32_6: 16 VSTO calls from 3 consecutive 128-bit loads, shifts advancing by 6, boundary-crossing fields stitched from two words. Second VSTO argument is i*16+k. */ +#define BITUNBLKV32_6(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov 
= _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_6: two BITUNBLKV32_6 blocks (i = 0, 1). */ +#define BITUNPACKV32_6(ip, op, parm) {\ + BITUNBLKV32_6(ip, 0, op, parm);\ + BITUNBLKV32_6(ip, 1, op, parm);\ +} +/* BITUNBLKV32_7: 32 VSTO calls from 7 consecutive 128-bit loads, shifts advancing by 7, boundary-crossing fields stitched from two words. Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_7(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 24), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 23), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 
15), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_7: a single BITUNBLKV32_7 block with i = 0. */ +#define BITUNPACKV32_7(ip, op, parm) {\ + BITUNBLKV32_7(ip, 0, op, parm);\ +} +/* BITUNBLKV32_8: one 128-bit load, 4 VSTO calls with lane shifts 0, 8, 16, 24 (AND with mv on all but the last). Second VSTO argument is i*4+k. */ +#define BITUNBLKV32_8(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*4+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*4+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*4+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); VSTO(op,i*4+ 3,ov,parm); ;\ +} +/* BITUNPACKV32_8: eight BITUNBLKV32_8 blocks (i = 0..7). */ +#define BITUNPACKV32_8(ip, op, parm) {\ + BITUNBLKV32_8(ip, 0, op, parm);\ + BITUNBLKV32_8(ip, 1, op, parm);\ + BITUNBLKV32_8(ip, 2, op, parm);\ + BITUNBLKV32_8(ip, 3, op, parm);\ + BITUNBLKV32_8(ip, 4, op, parm);\ + BITUNBLKV32_8(ip, 5, op, parm);\ + BITUNBLKV32_8(ip, 6, op, parm);\ + BITUNBLKV32_8(ip, 7, op, parm);\ +} +/* BITUNBLKV32_9: 32 VSTO calls from 9 consecutive 128-bit loads, shifts advancing by 9, boundary-crossing fields stitched from two words. Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_9(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 22), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 21), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = 
_mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_9: a single BITUNBLKV32_9 block with i = 0. */ +#define BITUNPACKV32_9(ip, op, parm) {\ + BITUNBLKV32_9(ip, 0, op, parm);\ +} +/* BITUNBLKV32_10: 16 VSTO calls from 5 consecutive 128-bit loads, shifts advancing by 10, boundary-crossing fields stitched from two words. Second VSTO argument is i*16+k. */ +#define BITUNBLKV32_10(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_10: two BITUNBLKV32_10 blocks (i = 0, 1). */ +#define BITUNPACKV32_10(ip, op, parm) {\ + BITUNBLKV32_10(ip, 0, op, parm);\ + BITUNBLKV32_10(ip, 1, op, parm);\ +} +/* BITUNBLKV32_11: 32 VSTO calls from 11 consecutive 128-bit loads, shifts advancing by 11, boundary-crossing fields stitched from two words. Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_11(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv 
= _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 19), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 20), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_11: a single BITUNBLKV32_11 block with i = 0. */ +#define BITUNPACKV32_11(ip, op, parm) {\ + BITUNBLKV32_11(ip, 0, op, parm);\ +} +/* BITUNBLKV32_12: 8 VSTO calls from 3 consecutive 128-bit loads, shifts advancing by 12, boundary-crossing fields stitched from two words. Second VSTO argument is i*8+k. */ +#define BITUNBLKV32_12(ip, i, op, parm) { __m128i ov,iv = 
_mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*8+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*8+ 3,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*8+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*8+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); VSTO(op,i*8+ 7,ov,parm); ;\ +} +/* BITUNPACKV32_12: four BITUNBLKV32_12 blocks (i = 0..3). */ +#define BITUNPACKV32_12(ip, op, parm) {\ + BITUNBLKV32_12(ip, 0, op, parm);\ + BITUNBLKV32_12(ip, 1, op, parm);\ + BITUNBLKV32_12(ip, 2, op, parm);\ + BITUNBLKV32_12(ip, 3, op, parm);\ +} +/* BITUNBLKV32_13: 32 VSTO calls from 13 consecutive 128-bit loads, shifts advancing by 13, boundary-crossing fields stitched from two words. Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_13(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 
8,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 17), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 18), mv); 
VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_13: a single BITUNBLKV32_13 block with i = 0. */ +#define BITUNPACKV32_13(ip, op, parm) {\ + BITUNBLKV32_13(ip, 0, op, parm);\ +} +/* BITUNBLKV32_14: 16 VSTO calls from 7 consecutive 128-bit loads, shifts advancing by 14, boundary-crossing fields stitched from two words. Second VSTO argument is i*16+k. */ +#define BITUNBLKV32_14(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*16+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 7,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*16+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_14: two BITUNBLKV32_14 blocks (i = 0, 1). */ +#define BITUNPACKV32_14(ip, op, parm) {\ + BITUNBLKV32_14(ip, 0, op, parm);\ + BITUNBLKV32_14(ip, 1, op, parm);\ +} +/* BITUNBLKV32_15: 32 VSTO calls from 15 consecutive 128-bit loads, shifts advancing by 15, boundary-crossing fields stitched from two words. Second VSTO argument is i*32+k. */ +#define BITUNBLKV32_15(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 15), mv); VSTO(op,i*32+ 1,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); 
VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+15,ov,parm); \ + ov = _mm_and_si128(_mm_srli_epi32(iv, 16), mv); VSTO(op,i*32+16,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); 
VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+30,ov,parm); \ + ov = _mm_srli_epi32(iv, 17); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_15: a single BITUNBLKV32_15 block with i = 0. */ +#define BITUNPACKV32_15(ip, op, parm) {\ + BITUNBLKV32_15(ip, 0, op, parm);\ +} +/* BITUNBLKV32_16: one 128-bit load, 2 VSTO calls: first the lanes ANDed with mv, then the lanes shifted right by 16 with no AND. Second VSTO argument is i*2+k. */ +#define BITUNBLKV32_16(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*2+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 16); VSTO(op,i*2+ 1,ov,parm); ;\ +} +/* BITUNPACKV32_16: sixteen BITUNBLKV32_16 blocks (i = 0..15). */ +#define BITUNPACKV32_16(ip, op, parm) {\ + BITUNBLKV32_16(ip, 0, op, parm);\ + BITUNBLKV32_16(ip, 1, op, parm);\ + BITUNBLKV32_16(ip, 2, op, parm);\ + BITUNBLKV32_16(ip, 3, op, parm);\ + BITUNBLKV32_16(ip, 4, op, parm);\ + BITUNBLKV32_16(ip, 5, op, parm);\ + BITUNBLKV32_16(ip, 6, op, parm);\ + BITUNBLKV32_16(ip, 7, op, parm);\ + BITUNBLKV32_16(ip, 8, op, parm);\ + BITUNBLKV32_16(ip, 9, op, parm);\ + BITUNBLKV32_16(ip, 10, op, parm);\ + BITUNBLKV32_16(ip, 11, op, parm);\ + BITUNBLKV32_16(ip, 12, op, parm);\ + BITUNBLKV32_16(ip, 13, op, parm);\ + BITUNBLKV32_16(ip, 14, op, parm);\ + BITUNBLKV32_16(ip, 15, op, parm);\ +} + +#define BITUNBLKV32_17(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 14), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+21,ov,parm); \ + ov = 
_mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 13), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACKV32_17(ip, op, parm) {\ + BITUNBLKV32_17(ip, 0, op, parm);\ +} + +#define BITUNBLKV32_18(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*16+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACKV32_18(ip, op, parm) {\ + BITUNBLKV32_18(ip, 0, op, parm);\ + BITUNBLKV32_18(ip, 1, op, parm);\ +} + +#define BITUNBLKV32_19(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 12), mv); VSTO(op,i*32+ 
4,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 11), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACKV32_19(ip, op, parm) {\ + BITUNBLKV32_19(ip, 0, op, parm);\ +} + +#define BITUNBLKV32_20(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*8+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*8+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); VSTO(op,i*8+ 7,ov,parm); ;\ +} + +#define BITUNPACKV32_20(ip, op, parm) {\ + BITUNBLKV32_20(ip, 0, op, parm);\ + BITUNBLKV32_20(ip, 1, op, parm);\ + BITUNBLKV32_20(ip, 2, op, parm);\ + BITUNBLKV32_20(ip, 3, op, parm);\ +} + +#define BITUNBLKV32_21(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 10), mv); VSTO(op,i*32+ 2,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 9), mv); VSTO(op,i*32+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm); \ + ov = 
_mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+11,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+20,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); 
VSTO(op,i*32+23,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+26,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+29,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACKV32_21(ip, op, parm) {\ + BITUNBLKV32_21(ip, 0, op, parm);\ +} + +#define BITUNBLKV32_22(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); 
VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*16+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); VSTO(op,i*16+15,ov,parm); ;\ +} + +#define BITUNPACKV32_22(ip, op, parm) {\ + BITUNBLKV32_22(ip, 0, op, parm);\ + BITUNBLKV32_22(ip, 1, op, parm);\ +} + +#define BITUNBLKV32_23(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+ 3,ov,parm); \ + ov = _mm_srli_epi32(iv, 
28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+ 7,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+14,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 7), mv); VSTO(op,i*32+17,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+28,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); VSTO(op,i*32+31,ov,parm); ;\ +} + +#define BITUNPACKV32_23(ip, op, parm) {\ + BITUNBLKV32_23(ip, 0, op, parm);\ +} + +#define BITUNBLKV32_24(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i 
*)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*4+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); VSTO(op,i*4+ 3,ov,parm); ;\ +} + +#define BITUNPACKV32_24(ip, op, parm) {\ + BITUNBLKV32_24(ip, 0, op, parm);\ + BITUNBLKV32_24(ip, 1, op, parm);\ + BITUNBLKV32_24(ip, 2, op, parm);\ + BITUNBLKV32_24(ip, 3, op, parm);\ + BITUNBLKV32_24(ip, 4, op, parm);\ + BITUNBLKV32_24(ip, 5, op, parm);\ + BITUNBLKV32_24(ip, 6, op, parm);\ + BITUNBLKV32_24(ip, 7, op, parm);\ +} + +#define BITUNBLKV32_25(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+ 4,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ + 
ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+ 9,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 5), mv); VSTO(op,i*32+13,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+18,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); 
VSTO(op,i*32+21,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 6), mv); VSTO(op,i*32+22,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+27,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_25: expands BITUNBLKV32_25 exactly once, with block index 0. */ +#define BITUNPACKV32_25(ip, op, parm) {\ + BITUNBLKV32_25(ip, 0, op, parm);\ +} +/* BITUNBLKV32_26: unpacks one block of 26-bit fields from each 32-bit lane. It does 13 unaligned 128-bit loads through ip (post-incrementing ip each time) and hands 16 result vectors to VSTO at indices i*16+0 .. i*16+15. A field that straddles two loads is the OR of the old vector shifted right and the new vector shifted left and ANDed with mv; the last field needs no AND because the right shift by 6 already leaves 26 bits. mv and VSTO are defined outside this macro; mv is presumably the 26-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_26(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv =
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 5,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*16+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_26: expands BITUNBLKV32_26 twice (i = 0, 1), covering VSTO indices 0..31. */ +#define BITUNPACKV32_26(ip, op, parm) {\ + BITUNBLKV32_26(ip, 0, op, parm);\ + BITUNBLKV32_26(ip, 1, op, parm);\ +} +/* BITUNBLKV32_27: same scheme for 27-bit fields: 27 unaligned 128-bit loads through ip (post-incremented each time), 32 VSTO calls at indices i*32+0 .. i*32+31. Fields that lie wholly inside the current vector (0, 6, 12, 19, 25) are a shift plus AND with mv; the last one is a bare right shift by 5. mv and VSTO are defined outside this macro; mv is presumably the 27-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_27(ip, i, op, parm) { __m128i ov,iv =
_mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+ 6,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 4), mv); VSTO(op,i*32+12,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv,
1), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+19,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 3), mv); VSTO(op,i*32+25,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ + ov =
_mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_27: expands BITUNBLKV32_27 exactly once, with block index 0. */ +#define BITUNPACKV32_27(ip, op, parm) {\ + BITUNBLKV32_27(ip, 0, op, parm);\ +} +/* BITUNBLKV32_28: 28-bit fields: 7 unaligned 128-bit loads through ip (post-incremented each time), 8 VSTO calls at indices i*8+0 .. i*8+7. Only field 0 is a plain AND with mv; fields 1-6 each combine two loads, and field 7 is a bare right shift by 4. mv and VSTO are defined outside this macro; mv is presumably the 28-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_28(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*8+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); VSTO(op,i*8+ 7,ov,parm); ;\ +} +/* BITUNPACKV32_28: expands BITUNBLKV32_28 four times (i = 0..3), covering VSTO indices 0..31. */ +#define
BITUNPACKV32_28(ip, op, parm) {\ + BITUNBLKV32_28(ip, 0, op, parm);\ + BITUNBLKV32_28(ip, 1, op, parm);\ + BITUNBLKV32_28(ip, 2, op, parm);\ + BITUNBLKV32_28(ip, 3, op, parm);\ +} +/* BITUNBLKV32_29: 29-bit fields: 29 unaligned 128-bit loads through ip (post-incremented each time), 32 VSTO calls at indices i*32+0 .. i*32+31. Fields 0, 10 and 21 lie wholly inside the current vector (shift plus AND with mv); the last field is a bare right shift by 3; every other field combines two loads. mv and VSTO are defined outside this macro; mv is presumably the 29-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_29(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 2), mv); VSTO(op,i*32+10,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv,
1), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_and_si128(_mm_srli_epi32(iv, 1), mv); VSTO(op,i*32+21,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov,
_mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_29: expands BITUNBLKV32_29 exactly once, with block index 0. */ +#define BITUNPACKV32_29(ip, op, parm) {\ + BITUNBLKV32_29(ip, 0, op, parm);\ +} +/* BITUNBLKV32_30: 30-bit fields: 15 unaligned 128-bit loads through ip (post-incremented each time), 16 VSTO calls at indices i*16+0 .. i*16+15. Only field 0 is a plain AND with mv; fields 1-14 each combine two loads, and field 15 is a bare right shift by 2. mv and VSTO are defined outside this macro; mv is presumably the 30-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_30(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*16+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ + ov
= _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); VSTO(op,i*16+15,ov,parm); ;\ +} +/* BITUNPACKV32_30: expands BITUNBLKV32_30 twice (i = 0, 1), covering VSTO indices 0..31. */ +#define BITUNPACKV32_30(ip, op, parm) {\ + BITUNBLKV32_30(ip, 0, op, parm);\ + BITUNBLKV32_30(ip, 1, op, parm);\ +} +/* BITUNBLKV32_31: 31-bit fields: 31 unaligned 128-bit loads through ip (post-incremented each time), 32 VSTO calls at indices i*32+0 .. i*32+31. Field 0 is a plain AND with mv; fields 1-30 each combine two loads (right shift 32-k of the old vector, left shift k of the new one); field 31 is a bare right shift by 1. mv and VSTO are defined outside this macro; mv is presumably the 31-bit lane mask (TODO confirm). */ +#define BITUNBLKV32_31(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*32+ 0,ov,parm); \ + ov = _mm_srli_epi32(iv, 31); iv =
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ + ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ + ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ + ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ + ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ + ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ + ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ + ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ + ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ + ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ + ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ + ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ + ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov =
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ + ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ + ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ + ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ + ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ + ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ + ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ + ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ + ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ + ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ + ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ + ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ + ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov,
_mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ + ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ + ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ + ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ + ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ + ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ + ov = _mm_srli_epi32(iv, 1); VSTO(op,i*32+31,ov,parm); ;\ +} +/* BITUNPACKV32_31: expands BITUNBLKV32_31 exactly once, with block index 0. */ +#define BITUNPACKV32_31(ip, op, parm) {\ + BITUNBLKV32_31(ip, 0, op, parm);\ +} +/* BITUNBLKV32_32: one unaligned 128-bit load through ip (post-incremented), ANDed with mv and passed to VSTO at index i*1+0, i.e. i. NOTE(review): for full 32-bit fields this AND only leaves the data unchanged if mv is all ones - confirm against the code that sets mv. */ +#define BITUNBLKV32_32(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ + ov = _mm_and_si128( iv , mv); VSTO(op,i*1+ 0,ov,parm); ;\ +} +/* BITUNPACKV32_32: expands BITUNBLKV32_32 32 times (i = 0..31), i.e. one load and one VSTO per expansion. */ +#define BITUNPACKV32_32(ip, op, parm) {\ + BITUNBLKV32_32(ip, 0, op, parm);\ + BITUNBLKV32_32(ip, 1, op, parm);\ + BITUNBLKV32_32(ip, 2, op, parm);\ + BITUNBLKV32_32(ip, 3, op, parm);\ + BITUNBLKV32_32(ip, 4, op, parm);\ + BITUNBLKV32_32(ip, 5, op, parm);\ + BITUNBLKV32_32(ip, 6, op, parm);\ + BITUNBLKV32_32(ip, 7, op, parm);\ + BITUNBLKV32_32(ip, 8, op, parm);\ + BITUNBLKV32_32(ip, 9, op, parm);\ + BITUNBLKV32_32(ip, 10, op, parm);\ + BITUNBLKV32_32(ip, 11, op, parm);\ + BITUNBLKV32_32(ip, 12, op, parm);\ + BITUNBLKV32_32(ip, 13, op, parm);\ + BITUNBLKV32_32(ip, 14, op, parm);\ + BITUNBLKV32_32(ip, 15, op, parm);\ + BITUNBLKV32_32(ip, 16, op, parm);\ + BITUNBLKV32_32(ip, 17, op, parm);\ + BITUNBLKV32_32(ip, 18, op, parm);\ + BITUNBLKV32_32(ip, 19, op, parm);\ + BITUNBLKV32_32(ip, 20, op, parm);\ +
BITUNBLKV32_32(ip, 21, op, parm);\ + BITUNBLKV32_32(ip, 22, op, parm);\ + BITUNBLKV32_32(ip, 23, op, parm);\ + BITUNBLKV32_32(ip, 24, op, parm);\ + BITUNBLKV32_32(ip, 25, op, parm);\ + BITUNBLKV32_32(ip, 26, op, parm);\ + BITUNBLKV32_32(ip, 27, op, parm);\ + BITUNBLKV32_32(ip, 28, op, parm);\ + BITUNBLKV32_32(ip, 29, op, parm);\ + BITUNBLKV32_32(ip, 30, op, parm);\ + BITUNBLKV32_32(ip, 31, op, parm);\ +} +