From 56e9129b6362e8c5cc71110087bcc401eb3dd082 Mon Sep 17 00:00:00 2001 From: powturbo Date: Sat, 13 Nov 2021 19:01:32 +0100 Subject: [PATCH] TurboPFor: Bit Pack include --- bitpack_.h | 1612 ++++++++++++++++++++++++++-------------------------- 1 file changed, 806 insertions(+), 806 deletions(-) diff --git a/bitpack_.h b/bitpack_.h index 32f56cb..d20cf9f 100644 --- a/bitpack_.h +++ b/bitpack_.h @@ -2802,21 +2802,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con //--------------------------------------------------------- #define BITBLK128V16_1(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 1, iv), 1));\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 2, iv), 2));\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 3, iv), 3));\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 5, iv), 5));\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 6, iv), 6));\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 7, iv), 7));\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 9, iv), 9));\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+10, iv), 10));\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+11, iv), 11));\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+12, iv), 12));\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+13, iv), 13));\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+14, iv), 14));\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 1, iv), 1));\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 2, iv), 2));\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 3, iv), 3));\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 5, iv), 5));\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 6, iv), 6));\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 7, iv), 7));\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 9, iv), 9));\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+10, iv), 10));\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+11, iv), 11));\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+12, iv), 12));\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+13, iv), 13));\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+14, iv), 14));\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_1(ip, op, parm) {\ @@ -2825,13 +2825,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_2(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*8+ 0, iv, parm); ov = IP16(ip, i*8+ 0, iv);\ - VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 1, iv), 2));\ - VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 2, iv), 4));\ - VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 3, iv), 6));\ - VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 4, iv), 8));\ - VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 5, iv), 10));\ - VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 6, iv), 12));\ - VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 7, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 1, iv), 2));\ + VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 2, iv), 4));\ + VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 3, iv), 6));\ + VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 4, iv), 8));\ + VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 5, iv), 10));\ + VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 6, iv), 12));\ + VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 7, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_2(ip, op, parm) {\ @@ -2841,21 +2841,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_3(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 1, iv), 3));\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 2, iv), 6));\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 3, iv), 9));\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 4, iv), 12));\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 6, iv), 2));\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 7, iv), 5));\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 9, iv), 11));\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+11, iv), 1));\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+13, iv), 7));\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+14, iv), 10));\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 1, iv), 3));\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 2, iv), 6));\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 3, iv), 9));\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 4, iv), 12));\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 6, iv), 2));\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 7, iv), 5));\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 9, iv), 11));\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+11, iv), 1));\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+13, iv), 7));\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+14, iv), 10));\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_3(ip, op, parm) {\ @@ -2864,9 +2864,9 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_4(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*4+ 0, iv, parm); ov = IP16(ip, i*4+ 0, iv);\ - VI16(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*4+ 1, iv), 4));\ - VI16(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*4+ 2, iv), 8));\ - VI16(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*4+ 3, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*4+ 1, iv), 4));\ + VI16(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*4+ 2, iv), 8));\ + VI16(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*4+ 3, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_4(ip, op, parm) {\ @@ -2878,21 +2878,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_5(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 1, iv), 5));\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 2, iv), 10));\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 5, iv), 9));\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 7, iv), 3));\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+10, iv), 2));\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+11, iv), 7));\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+13, iv), 1));\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+14, iv), 6));\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 1, iv), 5));\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 2, iv), 10));\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 5, iv), 9));\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 7, iv), 3));\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+10, iv), 2));\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+11, iv), 7));\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+13, iv), 1));\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+14, iv), 6));\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_5(ip, op, parm) {\ @@ -2901,13 +2901,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_6(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*8+ 0, iv, parm); ov = IP16(ip, i*8+ 0, iv);\ - VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 1, iv), 6));\ - VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 3, iv), 2));\ - VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 4, iv), 8));\ - VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 6, iv), 4));\ - VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 7, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 1, iv), 6));\ + VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 3, iv), 2));\ + VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 4, iv), 8));\ + VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 6, iv), 4));\ + VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 7, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_6(ip, op, parm) {\ @@ -2917,21 +2917,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_7(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 1, iv), 7));\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 3, iv), 5));\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 5, iv), 3));\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 7, iv), 1));\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+10, iv), 6));\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 5);\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+14, iv), 2));\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 1, iv), 7));\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 3, iv), 5));\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 5, iv), 3));\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 7, iv), 1));\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 8, iv), 8));\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+10, iv), 6));\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 5);\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+14, iv), 2));\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_7(ip, op, parm) {\ @@ -2940,7 +2940,7 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_8(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*2+ 0, iv, parm); ov = IP16(ip, i*2+ 0, iv);\ - VI16(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*2+ 1, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*2+ 1, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_8(ip, op, parm) {\ @@ -2956,21 +2956,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_9(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 7);\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 2, iv), 2));\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 5);\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 6, iv), 6));\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 9, iv), 1));\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+11, iv), 3));\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+13, iv), 5));\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 7);\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 2, iv), 2));\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 5);\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 4, iv), 4));\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 6, iv), 6));\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 9, iv), 1));\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+11, iv), 3));\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+13, iv), 5));\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_9(ip, op, parm) {\ @@ -2979,13 +2979,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_10(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*8+ 0, iv, parm); ov = IP16(ip, i*8+ 0, iv);\ - VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 1, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 2, iv), 4));\ - VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 5, iv), 2));\ - VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 7, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 1, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 2, iv), 4));\ + VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 5, iv), 2));\ + VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 7, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_10(ip, op, parm) {\ @@ -2995,21 +2995,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_11(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 5);\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 10);\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 3, iv), 1));\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 9);\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 6, iv), 2));\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 9, iv), 3));\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 7);\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 5)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 5);\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 10);\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 3, iv), 1));\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 9);\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 6, iv), 2));\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 9, iv), 3));\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 7);\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+12, iv), 4));\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 5)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_11(ip, op, parm) {\ @@ -3018,9 +3018,9 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_12(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*4+ 0, iv, parm); ov = IP16(ip, i*4+ 0, iv);\ - VI16(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*4+ 1, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*4+ 2, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*4+ 3, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*4+ 1, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*4+ 2, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*4+ 3, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_12(ip, op, parm) {\ @@ -3032,21 +3032,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_13(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 9);\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 12);\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+ 5, iv), 1));\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 5);\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 11);\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+10, iv), 2));\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 7);\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 10);\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 9);\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 12);\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+ 5, iv), 1));\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 5);\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 11);\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+10, iv), 2));\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 7);\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 10);\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_13(ip, op, parm) {\ @@ -3055,13 +3055,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_14(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*8+ 0, iv, parm); ov = IP16(ip, i*8+ 0, iv);\ - VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 1, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 3, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 5, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 10);\ - VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*8+ 6, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 12);\ - VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*8+ 7, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 1, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 3, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 5, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 10);\ + VI16(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*8+ 6, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 12);\ + VI16(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*8+ 7, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_14(ip, op, parm) {\ @@ -3071,21 +3071,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V16_15(ip, i, op, parm) { __m128i ov,iv;\ VI16(ip, i*16+ 0, iv, parm); ov = IP16(ip, i*16+ 0, iv);\ - VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 1);\ - VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 2);\ - VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 3);\ - VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 4);\ - VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 5);\ - VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 6);\ - VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 7);\ - VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 8);\ - VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 9);\ - VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 10);\ - VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 11);\ - VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 12);\ - VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 13);\ - VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi16(iv, 14);\ - VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi16( IP16(ip, i*16+15, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI16(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 1, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 1);\ + VI16(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 2);\ + VI16(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 3, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 3);\ + VI16(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 4);\ + VI16(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 5, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 5);\ + VI16(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 6);\ + VI16(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 7, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 7);\ + VI16(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 8);\ + VI16(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+ 9, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 9);\ + VI16(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+10, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 10);\ + VI16(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+11, iv), 5)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 11);\ + VI16(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 12);\ + VI16(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+13, iv), 3)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 13);\ + VI16(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16(iv = IP16(ip, i*16+14, iv), 2)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi16(iv, 14);\ + VI16(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi16( IP16(ip, i*16+15, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V16_15(ip, op, parm) {\ @@ -3117,37 +3117,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_1(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 1));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 2));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 3));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 5));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 6));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 7));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 9));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 10));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 11));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 13));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 14));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 15));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 17));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 18));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 19));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 20));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 21));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 22));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 23));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 24));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 25));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 26));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 27));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 28));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 29));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 30));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 31)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 1));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 2));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 3));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 5));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 6));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 7));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 9));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 10));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 11));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 13));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 14));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 15));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 17));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 18));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 19));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 20));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 21));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 22));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 23));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 24));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 25));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 26));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 27));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 28));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 29));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 30));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 31)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_1(ip, op, parm) {\ @@ -3156,21 +3156,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_2(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 1, iv), 2));\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 2, iv), 4));\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 3, iv), 6));\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 5, iv), 10));\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 6, iv), 12));\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 7, iv), 14));\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 9, iv), 18));\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+10, iv), 20));\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+11, iv), 22));\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+12, iv), 24));\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+13, iv), 26));\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+14, iv), 28));\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 30)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 1, iv), 2));\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 2, iv), 4));\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 3, iv), 6));\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 5, iv), 10));\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 6, iv), 12));\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 7, iv), 14));\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 9, iv), 18));\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+10, iv), 20));\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+11, iv), 22));\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+12, iv), 24));\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+13, iv), 26));\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+14, iv), 28));\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 30)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_2(ip, op, parm) {\ @@ -3180,37 +3180,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_3(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 3));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 6));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 9));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 15));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 18));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 21));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 24));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 27));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 1));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 7));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 10));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 13));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 19));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 22));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 25));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 28));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 2));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 5));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 11));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 14));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 17));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 20));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 23));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 26));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 29)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 3));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 6));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 9));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 15));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 18));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 21));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 24));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 27));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 1));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 7));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 10));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 13));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 19));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 22));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 25));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 28));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 2));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 5));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 11));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 14));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 17));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 20));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 23));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 26));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 29)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_3(ip, op, parm) {\ @@ -3219,13 +3219,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_4(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*8+ 0, iv, parm); ov = IP32(ip, i*8+ 0, iv);\ - VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 1, iv), 4));\ - VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 2, iv), 8));\ - VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 3, iv), 12));\ - VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 4, iv), 16));\ - VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 5, iv), 20));\ - VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 6, iv), 24));\ - VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 7, iv), 28)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 1, iv), 4));\ + VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 2, iv), 8));\ + VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 3, iv), 12));\ + VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 4, iv), 16));\ + VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 5, iv), 20));\ + VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 6, iv), 24));\ + VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 7, iv), 28)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_4(ip, op, parm) {\ @@ -3237,37 +3237,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_5(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 5));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 10));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 15));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 20));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 25));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 3));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 13));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 18));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 23));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 1));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 6));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 11));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 21));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 26));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 9));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 14));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 19));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 24));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 2));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 7));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 12));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 17));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 22));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 27)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 5));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 10));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 15));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 20));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 25));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 3));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 13));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 18));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 23));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 1));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 6));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 11));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 21));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 26));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 9));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 14));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 19));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 24));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 2));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 7));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 12));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 17));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 22));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 27)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_5(ip, op, parm) {\ @@ -3276,21 +3276,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_6(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 1, iv), 6));\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 2, iv), 12));\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 3, iv), 18));\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 4, iv), 24));\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 6, iv), 4));\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 7, iv), 10));\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 9, iv), 22));\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+11, iv), 2));\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+13, iv), 14));\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+14, iv), 20));\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 26)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 1, iv), 6));\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 2, iv), 12));\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 3, iv), 18));\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 4, iv), 24));\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 6, iv), 4));\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 7, iv), 10));\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 9, iv), 22));\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+11, iv), 2));\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+13, iv), 14));\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+14, iv), 20));\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 26)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_6(ip, op, parm) {\ @@ -3300,37 +3300,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_7(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 7));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 14));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 21));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 3));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 10));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 17));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 24));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 6));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 13));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 20));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 2));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 9));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 23));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 5));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 12));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 19));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 1));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 15));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 22));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 11));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 18));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 25)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 7));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 14));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 21));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 3));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 10));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 17));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 24));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 6));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 13));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 20));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 2));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 9));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 23));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 5));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 12));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 19));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 1));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 15));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 22));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 11));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 18));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 25)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_7(ip, op, parm) {\ @@ -3339,9 +3339,9 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_8(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*4+ 0, iv, parm); ov = IP32(ip, i*4+ 0, iv);\ - VI32(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*4+ 1, iv), 8));\ - VI32(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*4+ 2, iv), 16));\ - VI32(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*4+ 3, iv), 24)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*4+ 1, iv), 8));\ + VI32(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*4+ 2, iv), 16));\ + VI32(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*4+ 3, iv), 24)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_8(ip, op, parm) {\ @@ -3357,37 +3357,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_9(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 9));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 18));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 13));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 22));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 17));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 3));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 21));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 7));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 2));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 11));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 20));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 6));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 15));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 1));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 10));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 19));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 5));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 14));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 23)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 9));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 18));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 13));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 22));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 17));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 3));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 21));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 7));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 2));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 11));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 20));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 6));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 15));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 1));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 10));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 19));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 5));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 14));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 23)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_9(ip, op, parm) {\ @@ -3396,21 +3396,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_10(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 1, iv), 10));\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 2, iv), 20));\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 5, iv), 18));\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 7, iv), 6));\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+10, iv), 4));\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+11, iv), 14));\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+13, iv), 2));\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+14, iv), 12));\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 22)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 1, iv), 10));\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 2, iv), 20));\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 5, iv), 18));\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 7, iv), 6));\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+10, iv), 4));\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+11, iv), 14));\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+13, iv), 2));\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+14, iv), 12));\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 22)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_10(ip, op, parm) {\ @@ -3420,37 +3420,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_11(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 11));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 1));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 2));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 13));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 3));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 14));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 15));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 5));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 6));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 17));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 7));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 18));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 19));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 9));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 20));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 10));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 21)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 11));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 1));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 2));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 13));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 3));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 14));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 15));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 5));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 6));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 17));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 7));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 18));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 19));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 9));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 20));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 10));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 21)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_11(ip, op, parm) {\ @@ -3459,13 +3459,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_12(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*8+ 0, iv, parm); ov = IP32(ip, i*8+ 0, iv);\ - VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 1, iv), 12));\ - VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 3, iv), 4));\ - VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 4, iv), 16));\ - VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 5, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 6, iv), 8));\ - VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 7, iv), 20)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 1, iv), 12));\ + VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 3, iv), 4));\ + VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 4, iv), 16));\ + VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 5, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 6, iv), 8));\ + VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 7, iv), 20)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_12(ip, op, parm) {\ @@ -3477,37 +3477,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_13(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 13));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 7));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 1));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 14));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 2));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 15));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 9));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 3));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 10));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 17));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 11));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 5));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 18));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 12));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 6));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 19)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 13));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 7));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 1));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 14));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 2));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 15));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 9));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 3));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 10));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 17));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 11));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 5));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 18));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 12));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 6));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 19)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_13(ip, op, parm) {\ @@ -3516,21 +3516,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_14(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 1, iv), 14));\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 3, iv), 10));\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 5, iv), 6));\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 7, iv), 2));\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+10, iv), 12));\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+14, iv), 4));\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 18)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 1, iv), 14));\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 3, iv), 10));\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 5, iv), 6));\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 7, iv), 2));\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 8, iv), 16));\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+10, iv), 12));\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+14, iv), 4));\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 18)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_14(ip, op, parm) {\ @@ -3540,37 +3540,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_15(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 1, iv), 15));\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 13));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 11));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 9));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 7));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 5));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 3));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+15, iv), 1));\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 14));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 12));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 10));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 6));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+30, iv), 2));\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 17)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 1, iv), 15));\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 13));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 11));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 9));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 7));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 5));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 3));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+15, iv), 1));\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+16, iv), 16));\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 14));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 12));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 10));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 6));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+30, iv), 2));\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 17)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_15(ip, op, parm) {\ @@ -3579,7 +3579,7 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_16(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*2+ 0, iv, parm); ov = IP32(ip, i*2+ 0, iv);\ - VI32(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*2+ 1, iv), 16)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*2+ 1, iv), 16)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_16(ip, op, parm) {\ @@ -3603,37 +3603,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_17(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 2));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 6));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 10));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 14));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 1));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 3));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 5));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 7));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 9));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 11));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 13));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 2));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 6));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 10));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 12));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 14));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 1));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 3));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 5));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 7));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 9));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 11));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 13));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_17(ip, op, parm) {\ @@ -3642,21 +3642,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_18(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 2, iv), 4));\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 6, iv), 12));\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 9, iv), 2));\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+11, iv), 6));\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+13, iv), 10));\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 2, iv), 4));\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 4, iv), 8));\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 6, iv), 12));\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 9, iv), 2));\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+11, iv), 6));\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+13, iv), 10));\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_18(ip, op, parm) {\ @@ -3666,37 +3666,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_19(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 6));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 5));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 11));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 10));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 3));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 9));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 2));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 1));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 7));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 6));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 12));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 5));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 11));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 10));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 3));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 9));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 2));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 1));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 7));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_19(ip, op, parm) {\ @@ -3705,13 +3705,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_20(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*8+ 0, iv, parm); ov = IP32(ip, i*8+ 0, iv);\ - VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 2, iv), 8));\ - VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 5, iv), 4));\ - VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 7, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 2, iv), 8));\ + VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 5, iv), 4));\ + VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 7, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_20(ip, op, parm) {\ @@ -3723,37 +3723,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_21(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 2, iv), 10));\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 5, iv), 9));\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+11, iv), 7));\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 6));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 5));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+23, iv), 3));\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+26, iv), 2));\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+29, iv), 1));\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 2, iv), 10));\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 5, iv), 9));\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 8, iv), 8));\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+11, iv), 7));\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 6));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 5));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+20, iv), 4));\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+23, iv), 3));\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+26, iv), 2));\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+29, iv), 1));\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_21(ip, op, parm) {\ @@ -3762,21 +3762,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_22(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 3, iv), 2));\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 6, iv), 4));\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 9, iv), 6));\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 3, iv), 2));\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 6, iv), 4));\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 9, iv), 6));\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+12, iv), 8));\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_22(ip, op, parm) {\ @@ -3786,37 +3786,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_23(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 3, iv), 5));\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 7, iv), 1));\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 6));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+14, iv), 2));\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+17, iv), 7));\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 3));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 3, iv), 5));\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 7, iv), 1));\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 6));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 21);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+14, iv), 2));\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+17, iv), 7));\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 3));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+24, iv), 8));\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+28, iv), 4));\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_23(ip, op, parm) {\ @@ -3825,9 +3825,9 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_24(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*4+ 0, iv, parm); ov = IP32(ip, i*4+ 0, iv);\ - VI32(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*4+ 3, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*4+ 3, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_24(ip, op, parm) {\ @@ -3843,37 +3843,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_25(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 9, iv), 1));\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+13, iv), 5));\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+18, iv), 2));\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+22, iv), 6));\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+27, iv), 3));\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 21);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 4, iv), 4));\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 9, iv), 1));\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+13, iv), 5));\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 23);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+18, iv), 2));\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+22, iv), 6));\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+27, iv), 3));\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_25(ip, op, parm) {\ @@ -3882,21 +3882,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_26(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+ 5, iv), 2));\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+10, iv), 4));\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+ 5, iv), 2));\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+10, iv), 4));\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_26(ip, op, parm) {\ @@ -3906,37 +3906,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_27(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+ 6, iv), 2));\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+19, iv), 1));\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+25, iv), 3));\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 5)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 25);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+ 6, iv), 2));\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 23);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+12, iv), 4));\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 21);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 26);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+19, iv), 1));\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+25, iv), 3));\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 5)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_27(ip, op, parm) {\ @@ -3945,13 +3945,13 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_28(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*8+ 0, iv, parm); ov = IP32(ip, i*8+ 0, iv);\ - VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*8+ 7, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*8+ 7, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_28(ip, op, parm) {\ @@ -3963,37 +3963,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_29(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+10, iv), 2));\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+21, iv), 1));\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 21);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 27);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+10, iv), 2));\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 25);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 28);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+21, iv), 1));\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 23);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 26);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_29(ip, op, parm) {\ @@ -4002,21 +4002,21 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_30(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*16+ 0, iv, parm); ov = IP32(ip, i*16+ 0, iv);\ - VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*16+15, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 26);\ + VI32(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 28);\ + VI32(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*16+15, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_30(ip, op, parm) {\ @@ -4026,37 +4026,37 @@ unsigned char *TEMPLATE2(_BITPACK_,64)( uint64_t *__restrict in, unsigned n, con #define BITBLK128V32_31(ip, i, op, parm) { __m128i ov,iv;\ VI32(ip, i*32+ 0, iv, parm); ov = IP32(ip, i*32+ 0, iv);\ - VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IP32(ip, i*32+31, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ + VI32(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 1);\ + VI32(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 2);\ + VI32(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 3);\ + VI32(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 4);\ + VI32(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 5, iv), 27)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 5);\ + VI32(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 6);\ + VI32(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 7);\ + VI32(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 8);\ + VI32(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 9);\ + VI32(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 10);\ + VI32(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 11);\ + VI32(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 12);\ + VI32(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 13);\ + VI32(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 14);\ + VI32(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 15);\ + VI32(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 16);\ + VI32(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 17);\ + VI32(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 18);\ + VI32(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 19);\ + VI32(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 20);\ + VI32(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 21);\ + VI32(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 22);\ + VI32(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 23);\ + VI32(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 24);\ + VI32(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 25);\ + VI32(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 26);\ + VI32(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 27);\ + VI32(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 28);\ + VI32(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 29);\ + VI32(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32(iv = IP32(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = mm_srli_epi32(iv, 30);\ + VI32(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, mm_slli_epi32( IP32(ip, i*32+31, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ } #define BITPACK128V32_31(ip, op, parm) {\