diff --git a/bitpack128v_.h b/bitpack128v_.h deleted file mode 100644 index 71db5f3..0000000 --- a/bitpack128v_.h +++ /dev/null @@ -1,2036 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// TurboPFor: Integer Compression SIMD bit packing -#define BITBLK128V32_1(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 1));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 3));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 5));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 7));\ - VSTI(ip, i*32+ 8, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 9));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 10));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 11));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 13));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 14));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 15));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 17));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 18));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 19));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 20));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 21));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 22));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 23));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 24));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 25));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 26));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 27));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*32+28, iv), 28));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 29));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 30));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 31)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_1(ip, op, parm) {\ - BITBLK128V32_1(ip, 0, op, parm); IPPE(ip); OPPE(op += 1*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_2(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 2));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 6));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 10));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 14));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 18));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 20));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 22));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 24));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 26));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 28));\ - VSTI(ip, i*16+15, iv, parm); 
ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 30)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_2(ip, op, parm) {\ - BITBLK128V32_2(ip, 0, op, parm);\ - BITBLK128V32_2(ip, 1, op, parm); IPPE(ip); OPPE(op += 2*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_3(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 3));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 9));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 15));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 18));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 21));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 27));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 1));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 7));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 10));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 13));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, 
iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 19));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 22));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 25));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 28));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 2));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 5));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 11));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 14));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 17));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 20));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 23));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 26));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 29)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_3(ip, op, parm) {\ - BITBLK128V32_3(ip, 0, op, parm); IPPE(ip); OPPE(op += 3*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_4(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 1, iv), 4));\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 
2, iv), 8));\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 3, iv), 12));\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 5, iv), 20));\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 6, iv), 24));\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 28)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_4(ip, op, parm) {\ - BITBLK128V32_4(ip, 0, op, parm);\ - BITBLK128V32_4(ip, 1, op, parm);\ - BITBLK128V32_4(ip, 2, op, parm);\ - BITBLK128V32_4(ip, 3, op, parm); IPPE(ip); OPPE(op += 4*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_5(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 5));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 15));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 20));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 25));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 3));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 13));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 18));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( 
IPP(ip, i*32+11, iv), 23));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 1));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 6));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 11));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 21));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 26));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 9));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 14));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 19));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 24));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 2));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 7));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 12));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 17));\ - VSTI(ip, i*32+30, iv, parm); 
ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 22));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 27)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_5(ip, op, parm) {\ - BITBLK128V32_5(ip, 0, op, parm); IPPE(ip); OPPE(op += 5*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_6(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 6));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 12));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 18));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 24));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 10));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 22));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 2));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 14));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 20));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*16+15, iv), 26)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_6(ip, op, parm) {\ - BITBLK128V32_6(ip, 0, op, parm);\ - BITBLK128V32_6(ip, 1, op, parm); IPPE(ip); OPPE(op += 6*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_7(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 7));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 14));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 21));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 3));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 10));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 17));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 6));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 13));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 20));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 2));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*32+15, iv), 9));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 23));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 5));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 12));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 19));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 1));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 15));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 22));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 11));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 18));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 25)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_7(ip, op, parm) {\ - BITBLK128V32_7(ip, 0, op, parm); IPPE(ip); OPPE(op += 7*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_8(ip, i, op, parm) { __m128i ov,iv;\ - 
VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 1, iv), 8));\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 2, iv), 16));\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 3, iv), 24)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_8(ip, op, parm) {\ - BITBLK128V32_8(ip, 0, op, parm);\ - BITBLK128V32_8(ip, 1, op, parm);\ - BITBLK128V32_8(ip, 2, op, parm);\ - BITBLK128V32_8(ip, 3, op, parm);\ - BITBLK128V32_8(ip, 4, op, parm);\ - BITBLK128V32_8(ip, 5, op, parm);\ - BITBLK128V32_8(ip, 6, op, parm);\ - BITBLK128V32_8(ip, 7, op, parm); IPPE(ip); OPPE(op += 8*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_9(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 9));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 18));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 13));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 22));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 17));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 3));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 21));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 7));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 2));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 11));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 20));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 6));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 15));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 1));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 10));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 19));\ - VSTI(ip, i*32+28, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 5));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 14));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 23)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_9(ip, op, parm) {\ - BITBLK128V32_9(ip, 0, op, parm); IPPE(ip); OPPE(op += 9*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_10(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 1, iv), 10));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 20));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 18));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 6));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 4));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 14));\ - VSTI(ip, i*16+12, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 2));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 12));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 22)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_10(ip, op, parm) {\ - BITBLK128V32_10(ip, 0, op, parm);\ - BITBLK128V32_10(ip, 1, op, parm); IPPE(ip); OPPE(op += 10*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_11(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 11));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 1));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 13));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 3));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 14));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 
25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 15));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 5));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 6));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 17));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 7));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 18));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 19));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 9));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( 
IPP(ip, i*32+28, iv), 20));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 10));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 21)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_11(ip, op, parm) {\ - BITBLK128V32_11(ip, 0, op, parm); IPPE(ip); OPPE(op += 11*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_12(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 1, iv), 12));\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 3, iv), 4));\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 6, iv), 8));\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 20)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_12(ip, op, parm) {\ - BITBLK128V32_12(ip, 0, op, parm);\ - BITBLK128V32_12(ip, 1, op, parm);\ - BITBLK128V32_12(ip, 2, op, parm);\ - BITBLK128V32_12(ip, 3, op, parm); IPPE(ip); OPPE(op += 12*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_13(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 13));\ - VSTI(ip, i*32+ 2, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 7));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 1));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 14));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 2));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 15));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 9));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 3));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+18, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 10));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 17));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 11));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 5));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 18));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 12));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 6));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 19)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_13(ip, op, parm) {\ - BITBLK128V32_13(ip, 0, op, parm); IPPE(ip); OPPE(op += 13*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_14(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, 
i*16+ 1, iv), 14));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 10));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 6));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 7, iv), 2));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 12));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+14, iv), 4));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 18)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_14(ip, op, parm) {\ - BITBLK128V32_14(ip, 0, op, parm);\ - BITBLK128V32_14(ip, 1, op, parm); IPPE(ip); OPPE(op += 14*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_15(ip, i, op, 
parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 1, iv), 15));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 13));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 11));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 9));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 7));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 5));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 3));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+15, iv), 1));\ - VSTI(ip, i*32+16, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 14));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 12));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 10));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 6));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+30, iv), 2));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 17)); _mm_storeu_si128((__m128i *)op++, 
ov);\ -} - -#define BITPACK128V32_15(ip, op, parm) {\ - BITBLK128V32_15(ip, 0, op, parm); IPPE(ip); OPPE(op += 15*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_16(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*2+ 0, iv, parm); ov = IPP(ip, i*2+ 0, iv);\ - VSTI(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*2+ 1, iv), 16)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_16(ip, op, parm) {\ - BITBLK128V32_16(ip, 0, op, parm);\ - BITBLK128V32_16(ip, 1, op, parm);\ - BITBLK128V32_16(ip, 2, op, parm);\ - BITBLK128V32_16(ip, 3, op, parm);\ - BITBLK128V32_16(ip, 4, op, parm);\ - BITBLK128V32_16(ip, 5, op, parm);\ - BITBLK128V32_16(ip, 6, op, parm);\ - BITBLK128V32_16(ip, 7, op, parm);\ - BITBLK128V32_16(ip, 8, op, parm);\ - BITBLK128V32_16(ip, 9, op, parm);\ - BITBLK128V32_16(ip, 10, op, parm);\ - BITBLK128V32_16(ip, 11, op, parm);\ - BITBLK128V32_16(ip, 12, op, parm);\ - BITBLK128V32_16(ip, 13, op, parm);\ - BITBLK128V32_16(ip, 14, op, parm);\ - BITBLK128V32_16(ip, 15, op, parm); IPPE(ip); OPPE(op += 16*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_17(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ - VSTI(ip, i*32+ 7, iv, 
parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 10));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 14));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 1));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 3));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 5));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 7));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 9));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 11));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 13));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 15)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_17(ip, op, parm) {\ - BITBLK128V32_17(ip, 0, op, parm); IPPE(ip); OPPE(op += 17*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_18(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 4, 
iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 2));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+11, iv), 6));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+13, iv), 10));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 14)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_18(ip, op, parm) {\ - BITBLK128V32_18(ip, 0, op, parm);\ - BITBLK128V32_18(ip, 1, op, parm); IPPE(ip); OPPE(op += 18*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_19(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 5));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 11));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 10));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 3));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 9));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 2));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 1));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 7));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 13)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_19(ip, op, parm) {\ 
- BITBLK128V32_19(ip, 0, op, parm); IPPE(ip); OPPE(op += 19*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_20(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 5, iv), 4));\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 12)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_20(ip, op, parm) {\ - BITBLK128V32_20(ip, 0, op, parm);\ - BITBLK128V32_20(ip, 1, op, parm);\ - BITBLK128V32_20(ip, 2, op, parm);\ - BITBLK128V32_20(ip, 3, op, parm); IPPE(ip); OPPE(op += 20*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_21(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv 
= IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 5, iv), 9));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+11, iv), 7));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 6));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 5));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ 
- VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+23, iv), 3));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+26, iv), 2));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+29, iv), 1));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 11)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_21(ip, op, parm) {\ - BITBLK128V32_21(ip, 0, op, parm); IPPE(ip); OPPE(op += 21*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_22(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, 
i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 3, iv), 2));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 9, iv), 6));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 10)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_22(ip, op, parm) {\ - BITBLK128V32_22(ip, 0, op, parm);\ - BITBLK128V32_22(ip, 1, op, parm); IPPE(ip); OPPE(op += 22*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_23(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 3, iv), 5));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 7, iv), 1));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 6));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+14, iv), 2));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+17, iv), 7));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 3));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - 
VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 9)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_23(ip, op, parm) {\ - BITBLK128V32_23(ip, 0, op, parm); IPPE(ip); OPPE(op += 23*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_24(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*4+ 3, iv), 8)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_24(ip, op, parm) {\ - BITBLK128V32_24(ip, 0, op, parm);\ - BITBLK128V32_24(ip, 1, op, parm);\ - BITBLK128V32_24(ip, 2, op, parm);\ - BITBLK128V32_24(ip, 3, op, parm);\ - BITBLK128V32_24(ip, 4, op, parm);\ - BITBLK128V32_24(ip, 5, op, parm);\ - BITBLK128V32_24(ip, 6, op, parm);\ - BITBLK128V32_24(ip, 7, op, parm); IPPE(ip); OPPE(op += 24*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_25(ip, i, op, parm) { 
__m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 9, iv), 1));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+13, iv), 5));\ - VSTI(ip, i*32+14, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+18, iv), 2));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+22, iv), 6));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+27, iv), 3));\ - 
VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 7)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_25: expands one BITBLK128V32_25 block (i = 0), then IPPE(ip) and OPPE(op += 25*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_25(ip, op, parm) {\ - BITBLK128V32_25(ip, 0, op, parm); IPPE(ip); OPPE(op += 25*4/sizeof(op[0]));\ -} - /* BITBLK128V32_26: handles input indices i*16+0 .. i*16+15. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 16 inputs -> 13 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_26(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+ 5, iv), 2));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8,
iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+10, iv), 4));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 6)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_26: expands two BITBLK128V32_26 blocks (i = 0..1), then IPPE(ip) and OPPE(op += 26*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_26(ip, op, parm) {\ - BITBLK128V32_26(ip, 0, op, parm);\ - BITBLK128V32_26(ip, 1, op, parm); IPPE(ip); OPPE(op += 26*4/sizeof(op[0]));\ -} - /* BITBLK128V32_27: handles input indices i*32+0 .. i*32+31. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 32 inputs -> 27 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_27(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov =
_mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+18, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+19, iv), 1));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+25, iv), 3));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 5)); 
_mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_27: expands one BITBLK128V32_27 block (i = 0), then IPPE(ip) and OPPE(op += 27*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_27(ip, op, parm) {\ - BITBLK128V32_27(ip, 0, op, parm); IPPE(ip); OPPE(op += 27*4/sizeof(op[0]));\ -} - /* BITBLK128V32_28: handles input indices i*8+0 .. i*8+7. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 8 inputs -> 7 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_28(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*8+ 7, iv), 4)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_28: expands four BITBLK128V32_28 blocks (i = 0..3), then IPPE(ip) and OPPE(op += 28*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_28(ip, op, parm) {\ - BITBLK128V32_28(ip, 0, op, parm);\ - BITBLK128V32_28(ip, 1, op, parm);\ - BITBLK128V32_28(ip, 2, op, parm);\ - BITBLK128V32_28(ip, 3, op, parm); IPPE(ip); OPPE(op += 28*4/sizeof(op[0]));\ -} - /* BITBLK128V32_29: handles input indices i*32+0 .. i*32+31. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 32 inputs -> 29 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_29(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26));
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+10, iv), 2));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+21, iv), 1));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 3)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_29: expands one BITBLK128V32_29 block (i = 0), then IPPE(ip) and OPPE(op += 29*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_29(ip, op, parm) {\ - BITBLK128V32_29(ip, 0, op, parm); IPPE(ip); OPPE(op += 29*4/sizeof(op[0]));\ -} - /* BITBLK128V32_30: handles input indices i*16+0 .. i*16+15. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 16 inputs -> 15 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_30(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov);
ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*16+15, iv), 2)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_30: expands two BITBLK128V32_30 blocks (i = 0..1), then IPPE(ip) and OPPE(op += 30*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_30(ip, op, parm) {\ - BITBLK128V32_30(ip, 0, op, parm);\ - BITBLK128V32_30(ip, 1, op, parm); IPPE(ip); OPPE(op += 30*4/sizeof(op[0]));\ -} - /* BITBLK128V32_31: handles input indices i*32+0 .. i*32+31. Each step runs VSTI(), then ORs the IPP() result, shifted left within its 32-bit lanes, into the accumulator ov. On steps where the shifted value crosses the lane boundary, ov is stored through op++ and the bits shifted out (iv >> n) start the next ov. 32 inputs -> 31 stores. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_31(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28));
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32( IPP(ip, i*32+31, iv), 1)); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_31: expands one BITBLK128V32_31 block (i = 0), then IPPE(ip) and OPPE(op += 31*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_31(ip, op, parm) {\ - BITBLK128V32_31(ip, 0, op, parm); IPPE(ip); OPPE(op += 31*4/sizeof(op[0]));\ -} - /* BITBLK128V32_32: handles the single input index i*1+0: runs VSTI(), then stores the IPP() result unshifted through op++. 1 input -> 1 store. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_32(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*1+ 0, iv, parm); ov = IPP(ip, i*1+ 0, iv); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_32: expands 32 BITBLK128V32_32 blocks (i = 0..31), then IPPE(ip) and OPPE(op += 32*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_32(ip, op, parm) {\ - BITBLK128V32_32(ip, 0, op, parm);\ - BITBLK128V32_32(ip, 1, op, parm);\ - BITBLK128V32_32(ip, 2, op, parm);\ - BITBLK128V32_32(ip, 3, op, parm);\ - BITBLK128V32_32(ip, 4, op, parm);\ - BITBLK128V32_32(ip, 5, op, parm);\ - BITBLK128V32_32(ip, 6, op, parm);\ - BITBLK128V32_32(ip, 7, op, parm);\ - BITBLK128V32_32(ip, 8, op, parm);\ - BITBLK128V32_32(ip, 9, op, parm);\ - BITBLK128V32_32(ip, 10, op, parm);\ - BITBLK128V32_32(ip, 11, op, parm);\ - BITBLK128V32_32(ip, 12, op, parm);\ - BITBLK128V32_32(ip, 13, op, parm);\ - BITBLK128V32_32(ip, 14, op, parm);\ - BITBLK128V32_32(ip, 15, op, parm);\ - BITBLK128V32_32(ip, 16, op, parm);\ - BITBLK128V32_32(ip, 17, op, parm);\ - BITBLK128V32_32(ip, 18, op, parm);\ - BITBLK128V32_32(ip, 19, op, parm);\ - BITBLK128V32_32(ip, 20, op, parm);\ - BITBLK128V32_32(ip, 21, op, parm);\ - BITBLK128V32_32(ip, 22, op, parm);\ - BITBLK128V32_32(ip, 23, op, parm);\ - BITBLK128V32_32(ip, 24, op, parm);\ - BITBLK128V32_32(ip, 25, op, parm);\ - BITBLK128V32_32(ip, 26, op, parm);\ - BITBLK128V32_32(ip, 27, op, parm);\ - BITBLK128V32_32(ip, 28, op, parm);\ - BITBLK128V32_32(ip, 29, op, parm);\ - BITBLK128V32_32(ip, 30, op, parm);\ - BITBLK128V32_32(ip, 31, op, parm); IPPE(ip); OPPE(op += 32*4/sizeof(op[0]));\ -} - /* BITBLK128V32_33: handles input indices i*32+0 .. i*32+31 with a 33-bit stride. Every step ORs the shifted IPP() result into ov, stores ov through op++ and restarts ov from iv >> n; one extra store follows the last step, so 32 inputs -> 33 stores. ov is zero-initialised because the very first step ORs into ov before anything has been assigned to it (previously an uninitialised read). The first carry uses a shift count of 32, which the SSE2 shift intrinsics define as producing zero. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_33(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\ - VSTI(ip, i*32+ 0, iv, parm);
ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov 
= _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1); _mm_storeu_si128((__m128i *)op++, ov);\ -} - /* BITPACK128V32_33: expands one BITBLK128V32_33 block (i = 0), then IPPE(ip) and OPPE(op += 33*4/sizeof(op[0])). IPPE/OPPE are defined outside this chunk. */ -#define BITPACK128V32_33(ip, op, parm) {\ - BITBLK128V32_33(ip, 0, op, parm); IPPE(ip); OPPE(op += 33*4/sizeof(op[0]));\ -} - /* BITBLK128V32_34: handles input indices i*16+0 .. i*16+15 with a 34-bit stride. Every step ORs the shifted IPP() result into ov, stores ov through op++ and restarts ov from iv >> n; one extra store follows the last step, so 16 inputs -> 17 stores. ov is zero-initialised because the very first step ORs into ov before anything has been assigned to it (previously an uninitialised read). The first carry uses a shift count of 32, which the SSE2 shift intrinsics define as producing zero. VSTI/IPP are defined outside this chunk. */ -#define BITBLK128V32_34(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip,
i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_34(ip, op, parm) {\ - BITBLK128V32_34(ip, 0, op, parm);\ - BITBLK128V32_34(ip, 1, op, parm); IPPE(ip); OPPE(op 
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 35-bit block: 32 steps over elements i*32+0 .. i*32+31 with left shift
 * (3*k) mod 32, then one final store of the carried bits (33 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_35(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*32+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*32+ 1,  3, op, parm);\
  BITSTEP128V32(ip, i*32+ 2,  6, op, parm);\
  BITSTEP128V32(ip, i*32+ 3,  9, op, parm);\
  BITSTEP128V32(ip, i*32+ 4, 12, op, parm);\
  BITSTEP128V32(ip, i*32+ 5, 15, op, parm);\
  BITSTEP128V32(ip, i*32+ 6, 18, op, parm);\
  BITSTEP128V32(ip, i*32+ 7, 21, op, parm);\
  BITSTEP128V32(ip, i*32+ 8, 24, op, parm);\
  BITSTEP128V32(ip, i*32+ 9, 27, op, parm);\
  BITSTEP128V32(ip, i*32+10, 30, op, parm);\
  BITSTEP128V32(ip, i*32+11,  1, op, parm);\
  BITSTEP128V32(ip, i*32+12,  4, op, parm);\
  BITSTEP128V32(ip, i*32+13,  7, op, parm);\
  BITSTEP128V32(ip, i*32+14, 10, op, parm);\
  BITSTEP128V32(ip, i*32+15, 13, op, parm);\
  BITSTEP128V32(ip, i*32+16, 16, op, parm);\
  BITSTEP128V32(ip, i*32+17, 19, op, parm);\
  BITSTEP128V32(ip, i*32+18, 22, op, parm);\
  BITSTEP128V32(ip, i*32+19, 25, op, parm);\
  BITSTEP128V32(ip, i*32+20, 28, op, parm);\
  BITSTEP128V32(ip, i*32+21, 31, op, parm);\
  BITSTEP128V32(ip, i*32+22,  2, op, parm);\
  BITSTEP128V32(ip, i*32+23,  5, op, parm);\
  BITSTEP128V32(ip, i*32+24,  8, op, parm);\
  BITSTEP128V32(ip, i*32+25, 11, op, parm);\
  BITSTEP128V32(ip, i*32+26, 14, op, parm);\
  BITSTEP128V32(ip, i*32+27, 17, op, parm);\
  BITSTEP128V32(ip, i*32+28, 20, op, parm);\
  BITSTEP128V32(ip, i*32+29, 23, op, parm);\
  BITSTEP128V32(ip, i*32+30, 26, op, parm);\
  BITSTEP128V32(ip, i*32+31, 29, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 35-bit group: a single BITBLK128V32_35 block followed by the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_35(ip, op, parm) {\
  BITBLK128V32_35(ip, 0, op, parm);\
  IPPE(ip); OPPE(op += 35*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 36-bit block: 8 steps over elements i*8+0 .. i*8+7 with left shift 4*k,
 * then one final store of the carried bits (9 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_36(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*8+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*8+ 1,  4, op, parm);\
  BITSTEP128V32(ip, i*8+ 2,  8, op, parm);\
  BITSTEP128V32(ip, i*8+ 3, 12, op, parm);\
  BITSTEP128V32(ip, i*8+ 4, 16, op, parm);\
  BITSTEP128V32(ip, i*8+ 5, 20, op, parm);\
  BITSTEP128V32(ip, i*8+ 6, 24, op, parm);\
  BITSTEP128V32(ip, i*8+ 7, 28, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 36-bit group: four BITBLK128V32_36 blocks (i = 0..3), then the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_36(ip, op, parm) {\
  BITBLK128V32_36(ip, 0, op, parm);\
  BITBLK128V32_36(ip, 1, op, parm);\
  BITBLK128V32_36(ip, 2, op, parm);\
  BITBLK128V32_36(ip, 3, op, parm);\
  IPPE(ip); OPPE(op += 36*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 37-bit block: 32 steps over elements i*32+0 .. i*32+31 with left shift
 * (5*k) mod 32, then one final store of the carried bits (33 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_37(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*32+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*32+ 1,  5, op, parm);\
  BITSTEP128V32(ip, i*32+ 2, 10, op, parm);\
  BITSTEP128V32(ip, i*32+ 3, 15, op, parm);\
  BITSTEP128V32(ip, i*32+ 4, 20, op, parm);\
  BITSTEP128V32(ip, i*32+ 5, 25, op, parm);\
  BITSTEP128V32(ip, i*32+ 6, 30, op, parm);\
  BITSTEP128V32(ip, i*32+ 7,  3, op, parm);\
  BITSTEP128V32(ip, i*32+ 8,  8, op, parm);\
  BITSTEP128V32(ip, i*32+ 9, 13, op, parm);\
  BITSTEP128V32(ip, i*32+10, 18, op, parm);\
  BITSTEP128V32(ip, i*32+11, 23, op, parm);\
  BITSTEP128V32(ip, i*32+12, 28, op, parm);\
  BITSTEP128V32(ip, i*32+13,  1, op, parm);\
  BITSTEP128V32(ip, i*32+14,  6, op, parm);\
  BITSTEP128V32(ip, i*32+15, 11, op, parm);\
  BITSTEP128V32(ip, i*32+16, 16, op, parm);\
  BITSTEP128V32(ip, i*32+17, 21, op, parm);\
  BITSTEP128V32(ip, i*32+18, 26, op, parm);\
  BITSTEP128V32(ip, i*32+19, 31, op, parm);\
  BITSTEP128V32(ip, i*32+20,  4, op, parm);\
  BITSTEP128V32(ip, i*32+21,  9, op, parm);\
  BITSTEP128V32(ip, i*32+22, 14, op, parm);\
  BITSTEP128V32(ip, i*32+23, 19, op, parm);\
  BITSTEP128V32(ip, i*32+24, 24, op, parm);\
  BITSTEP128V32(ip, i*32+25, 29, op, parm);\
  BITSTEP128V32(ip, i*32+26,  2, op, parm);\
  BITSTEP128V32(ip, i*32+27,  7, op, parm);\
  BITSTEP128V32(ip, i*32+28, 12, op, parm);\
  BITSTEP128V32(ip, i*32+29, 17, op, parm);\
  BITSTEP128V32(ip, i*32+30, 22, op, parm);\
  BITSTEP128V32(ip, i*32+31, 27, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 37-bit group: a single BITBLK128V32_37 block followed by the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_37(ip, op, parm) {\
  BITBLK128V32_37(ip, 0, op, parm);\
  IPPE(ip); OPPE(op += 37*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 38-bit block: 16 steps over elements i*16+0 .. i*16+15 with left shift
 * (6*k) mod 32, then one final store of the carried bits (17 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_38(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*16+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*16+ 1,  6, op, parm);\
  BITSTEP128V32(ip, i*16+ 2, 12, op, parm);\
  BITSTEP128V32(ip, i*16+ 3, 18, op, parm);\
  BITSTEP128V32(ip, i*16+ 4, 24, op, parm);\
  BITSTEP128V32(ip, i*16+ 5, 30, op, parm);\
  BITSTEP128V32(ip, i*16+ 6,  4, op, parm);\
  BITSTEP128V32(ip, i*16+ 7, 10, op, parm);\
  BITSTEP128V32(ip, i*16+ 8, 16, op, parm);\
  BITSTEP128V32(ip, i*16+ 9, 22, op, parm);\
  BITSTEP128V32(ip, i*16+10, 28, op, parm);\
  BITSTEP128V32(ip, i*16+11,  2, op, parm);\
  BITSTEP128V32(ip, i*16+12,  8, op, parm);\
  BITSTEP128V32(ip, i*16+13, 14, op, parm);\
  BITSTEP128V32(ip, i*16+14, 20, op, parm);\
  BITSTEP128V32(ip, i*16+15, 26, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 38-bit group: two BITBLK128V32_38 blocks (i = 0, 1), then the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_38(ip, op, parm) {\
  BITBLK128V32_38(ip, 0, op, parm);\
  BITBLK128V32_38(ip, 1, op, parm);\
  IPPE(ip); OPPE(op += 38*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 39-bit block: 32 steps over elements i*32+0 .. i*32+31 with left shift
 * (7*k) mod 32, then one final store of the carried bits (33 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_39(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*32+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*32+ 1,  7, op, parm);\
  BITSTEP128V32(ip, i*32+ 2, 14, op, parm);\
  BITSTEP128V32(ip, i*32+ 3, 21, op, parm);\
  BITSTEP128V32(ip, i*32+ 4, 28, op, parm);\
  BITSTEP128V32(ip, i*32+ 5,  3, op, parm);\
  BITSTEP128V32(ip, i*32+ 6, 10, op, parm);\
  BITSTEP128V32(ip, i*32+ 7, 17, op, parm);\
  BITSTEP128V32(ip, i*32+ 8, 24, op, parm);\
  BITSTEP128V32(ip, i*32+ 9, 31, op, parm);\
  BITSTEP128V32(ip, i*32+10,  6, op, parm);\
  BITSTEP128V32(ip, i*32+11, 13, op, parm);\
  BITSTEP128V32(ip, i*32+12, 20, op, parm);\
  BITSTEP128V32(ip, i*32+13, 27, op, parm);\
  BITSTEP128V32(ip, i*32+14,  2, op, parm);\
  BITSTEP128V32(ip, i*32+15,  9, op, parm);\
  BITSTEP128V32(ip, i*32+16, 16, op, parm);\
  BITSTEP128V32(ip, i*32+17, 23, op, parm);\
  BITSTEP128V32(ip, i*32+18, 30, op, parm);\
  BITSTEP128V32(ip, i*32+19,  5, op, parm);\
  BITSTEP128V32(ip, i*32+20, 12, op, parm);\
  BITSTEP128V32(ip, i*32+21, 19, op, parm);\
  BITSTEP128V32(ip, i*32+22, 26, op, parm);\
  BITSTEP128V32(ip, i*32+23,  1, op, parm);\
  BITSTEP128V32(ip, i*32+24,  8, op, parm);\
  BITSTEP128V32(ip, i*32+25, 15, op, parm);\
  BITSTEP128V32(ip, i*32+26, 22, op, parm);\
  BITSTEP128V32(ip, i*32+27, 29, op, parm);\
  BITSTEP128V32(ip, i*32+28,  4, op, parm);\
  BITSTEP128V32(ip, i*32+29, 11, op, parm);\
  BITSTEP128V32(ip, i*32+30, 18, op, parm);\
  BITSTEP128V32(ip, i*32+31, 25, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 39-bit group: a single BITBLK128V32_39 block followed by the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_39(ip, op, parm) {\
  BITBLK128V32_39(ip, 0, op, parm);\
  IPPE(ip); OPPE(op += 39*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 40-bit block: 4 steps over elements i*4+0 .. i*4+3 with left shift 8*k,
 * then one final store of the carried bits (5 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_40(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*4+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*4+ 1,  8, op, parm);\
  BITSTEP128V32(ip, i*4+ 2, 16, op, parm);\
  BITSTEP128V32(ip, i*4+ 3, 24, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 40-bit group: eight BITBLK128V32_40 blocks (i = 0..7), then the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_40(ip, op, parm) {\
  BITBLK128V32_40(ip, 0, op, parm);\
  BITBLK128V32_40(ip, 1, op, parm);\
  BITBLK128V32_40(ip, 2, op, parm);\
  BITBLK128V32_40(ip, 3, op, parm);\
  BITBLK128V32_40(ip, 4, op, parm);\
  BITBLK128V32_40(ip, 5, op, parm);\
  BITBLK128V32_40(ip, 6, op, parm);\
  BITBLK128V32_40(ip, 7, op, parm);\
  IPPE(ip); OPPE(op += 40*4/sizeof(op[0]));\
}

/* 41-bit block: 32 steps over elements i*32+0 .. i*32+31 with left shift
 * (9*k) mod 32, then one final store of the carried bits (33 stores in total).
 * Same zero-initialization of `ov` as in the 40-bit block above. */
#define BITBLK128V32_41(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*32+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*32+ 1,  9, op, parm);\
  BITSTEP128V32(ip, i*32+ 2, 18, op, parm);\
  BITSTEP128V32(ip, i*32+ 3, 27, op, parm);\
  BITSTEP128V32(ip, i*32+ 4,  4, op, parm);\
  BITSTEP128V32(ip, i*32+ 5, 13, op, parm);\
  BITSTEP128V32(ip, i*32+ 6, 22, op, parm);\
  BITSTEP128V32(ip, i*32+ 7, 31, op, parm);\
  BITSTEP128V32(ip, i*32+ 8,  8, op, parm);\
  BITSTEP128V32(ip, i*32+ 9, 17, op, parm);\
  BITSTEP128V32(ip, i*32+10, 26, op, parm);\
  BITSTEP128V32(ip, i*32+11,  3, op, parm);\
  BITSTEP128V32(ip, i*32+12, 12, op, parm);\
  BITSTEP128V32(ip, i*32+13, 21, op, parm);\
  BITSTEP128V32(ip, i*32+14, 30, op, parm);\
  BITSTEP128V32(ip, i*32+15,  7, op, parm);\
  BITSTEP128V32(ip, i*32+16, 16, op, parm);\
  BITSTEP128V32(ip, i*32+17, 25, op, parm);\
  BITSTEP128V32(ip, i*32+18,  2, op, parm);\
  BITSTEP128V32(ip, i*32+19, 11, op, parm);\
  BITSTEP128V32(ip, i*32+20, 20, op, parm);\
  BITSTEP128V32(ip, i*32+21, 29, op, parm);\
  BITSTEP128V32(ip, i*32+22,  6, op, parm);\
  BITSTEP128V32(ip, i*32+23, 15, op, parm);\
  BITSTEP128V32(ip, i*32+24, 24, op, parm);\
  BITSTEP128V32(ip, i*32+25,  1, op, parm);\
  BITSTEP128V32(ip, i*32+26, 10, op, parm);\
  BITSTEP128V32(ip, i*32+27, 19, op, parm);\
  BITSTEP128V32(ip, i*32+28, 28, op, parm);\
  BITSTEP128V32(ip, i*32+29,  5, op, parm);\
  BITSTEP128V32(ip, i*32+30, 14, op, parm);\
  BITSTEP128V32(ip, i*32+31, 23, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 41-bit group: a single BITBLK128V32_41 block followed by the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_41(ip, op, parm) {\
  BITBLK128V32_41(ip, 0, op, parm);\
  IPPE(ip); OPPE(op += 41*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 42-bit block: 16 steps over elements i*16+0 .. i*16+15 with left shift
 * (10*k) mod 32, then one final store of the carried bits (17 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_42(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*16+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*16+ 1, 10, op, parm);\
  BITSTEP128V32(ip, i*16+ 2, 20, op, parm);\
  BITSTEP128V32(ip, i*16+ 3, 30, op, parm);\
  BITSTEP128V32(ip, i*16+ 4,  8, op, parm);\
  BITSTEP128V32(ip, i*16+ 5, 18, op, parm);\
  BITSTEP128V32(ip, i*16+ 6, 28, op, parm);\
  BITSTEP128V32(ip, i*16+ 7,  6, op, parm);\
  BITSTEP128V32(ip, i*16+ 8, 16, op, parm);\
  BITSTEP128V32(ip, i*16+ 9, 26, op, parm);\
  BITSTEP128V32(ip, i*16+10,  4, op, parm);\
  BITSTEP128V32(ip, i*16+11, 14, op, parm);\
  BITSTEP128V32(ip, i*16+12, 24, op, parm);\
  BITSTEP128V32(ip, i*16+13,  2, op, parm);\
  BITSTEP128V32(ip, i*16+14, 12, op, parm);\
  BITSTEP128V32(ip, i*16+15, 22, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 42-bit group: two BITBLK128V32_42 blocks (i = 0, 1), then the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_42(ip, op, parm) {\
  BITBLK128V32_42(ip, 0, op, parm);\
  BITBLK128V32_42(ip, 1, op, parm);\
  IPPE(ip); OPPE(op += 42*4/sizeof(op[0]));\
}
/* One packing step shared by the wide (more than 32 bits per value) block macros.
 * It relies on two locals of the enclosing block: `ov`, the output word being
 * assembled, and `iv`, the vector yielded by IPP for element `idx`.
 * The step ORs `iv << sh` (per 32-bit lane) into `ov`, stores `ov` through
 * `op` with an unaligned store while post-incrementing `op`, and then seeds
 * the next word with `iv >> (32 - sh)`.
 * VSTI and IPP are hooks provided by the including file; what they do with
 * `ip`, `idx` and `parm` is not visible here. */
#ifndef BITSTEP128V32
#define BITSTEP128V32(ip, idx, sh, op, parm) \
  VSTI(ip, idx, iv, parm);\
  ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, idx, iv), sh));\
  _mm_storeu_si128(op++, ov);\
  ov = _mm_srli_epi32(iv, 32 - (sh))
#endif

/* 43-bit block: 32 steps over elements i*32+0 .. i*32+31 with left shift
 * (11*k) mod 32, then one final store of the carried bits (33 stores in total).
 * `ov` is zero-initialized: the first step used to OR into an uninitialized
 * `ov`, leaking indeterminate bits into the first stored word.
 * NOTE(review): shifts are per 32-bit lane, so bits shifted past bit 31 are
 * dropped; confirm these wide variants are actually selected for 32-bit input. */
#define BITBLK128V32_43(ip, i, op, parm) { __m128i ov = _mm_setzero_si128(), iv;\
  BITSTEP128V32(ip, i*32+ 0,  0, op, parm);\
  BITSTEP128V32(ip, i*32+ 1, 11, op, parm);\
  BITSTEP128V32(ip, i*32+ 2, 22, op, parm);\
  BITSTEP128V32(ip, i*32+ 3,  1, op, parm);\
  BITSTEP128V32(ip, i*32+ 4, 12, op, parm);\
  BITSTEP128V32(ip, i*32+ 5, 23, op, parm);\
  BITSTEP128V32(ip, i*32+ 6,  2, op, parm);\
  BITSTEP128V32(ip, i*32+ 7, 13, op, parm);\
  BITSTEP128V32(ip, i*32+ 8, 24, op, parm);\
  BITSTEP128V32(ip, i*32+ 9,  3, op, parm);\
  BITSTEP128V32(ip, i*32+10, 14, op, parm);\
  BITSTEP128V32(ip, i*32+11, 25, op, parm);\
  BITSTEP128V32(ip, i*32+12,  4, op, parm);\
  BITSTEP128V32(ip, i*32+13, 15, op, parm);\
  BITSTEP128V32(ip, i*32+14, 26, op, parm);\
  BITSTEP128V32(ip, i*32+15,  5, op, parm);\
  BITSTEP128V32(ip, i*32+16, 16, op, parm);\
  BITSTEP128V32(ip, i*32+17, 27, op, parm);\
  BITSTEP128V32(ip, i*32+18,  6, op, parm);\
  BITSTEP128V32(ip, i*32+19, 17, op, parm);\
  BITSTEP128V32(ip, i*32+20, 28, op, parm);\
  BITSTEP128V32(ip, i*32+21,  7, op, parm);\
  BITSTEP128V32(ip, i*32+22, 18, op, parm);\
  BITSTEP128V32(ip, i*32+23, 29, op, parm);\
  BITSTEP128V32(ip, i*32+24,  8, op, parm);\
  BITSTEP128V32(ip, i*32+25, 19, op, parm);\
  BITSTEP128V32(ip, i*32+26, 30, op, parm);\
  BITSTEP128V32(ip, i*32+27,  9, op, parm);\
  BITSTEP128V32(ip, i*32+28, 20, op, parm);\
  BITSTEP128V32(ip, i*32+29, 31, op, parm);\
  BITSTEP128V32(ip, i*32+30, 10, op, parm);\
  BITSTEP128V32(ip, i*32+31, 21, op, parm);\
  _mm_storeu_si128((__m128i *)op++, ov);\
}

/* 43-bit group: a single BITBLK128V32_43 block followed by the IPPE/OPPE
 * end-of-group hooks (both defined by the including file). */
#define BITPACK128V32_43(ip, op, parm) {\
  BITBLK128V32_43(ip, 0, op, parm);\
  IPPE(ip); OPPE(op += 43*4/sizeof(op[0]));\
}
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_44(ip, op, parm) {\ - BITBLK128V32_44(ip, 0, op, parm);\ - BITBLK128V32_44(ip, 1, op, parm);\ - BITBLK128V32_44(ip, 2, op, parm);\ - BITBLK128V32_44(ip, 3, op, parm); IPPE(ip); OPPE(op += 44*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_45(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_45(ip, op, parm) {\ - BITBLK128V32_45(ip, 0, op, parm); IPPE(ip); OPPE(op += 45*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_46(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 
24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_46(ip, op, parm) {\ - BITBLK128V32_46(ip, 0, op, parm);\ - BITBLK128V32_46(ip, 1, op, parm); IPPE(ip); OPPE(op += 46*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_47(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 9)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_47(ip, op, parm) {\ - BITBLK128V32_47(ip, 0, op, parm); IPPE(ip); OPPE(op += 47*4/sizeof(op[0]));\ -} - 
-#define BITBLK128V32_48(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*2+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*2+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*2+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*2+ 1, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_48(ip, op, parm) {\ - BITBLK128V32_48(ip, 0, op, parm);\ - BITBLK128V32_48(ip, 1, op, parm);\ - BITBLK128V32_48(ip, 2, op, parm);\ - BITBLK128V32_48(ip, 3, op, parm);\ - BITBLK128V32_48(ip, 4, op, parm);\ - BITBLK128V32_48(ip, 5, op, parm);\ - BITBLK128V32_48(ip, 6, op, parm);\ - BITBLK128V32_48(ip, 7, op, parm);\ - BITBLK128V32_48(ip, 8, op, parm);\ - BITBLK128V32_48(ip, 9, op, parm);\ - BITBLK128V32_48(ip, 10, op, parm);\ - BITBLK128V32_48(ip, 11, op, parm);\ - BITBLK128V32_48(ip, 12, op, parm);\ - BITBLK128V32_48(ip, 13, op, parm);\ - BITBLK128V32_48(ip, 14, op, parm);\ - BITBLK128V32_48(ip, 15, op, parm); IPPE(ip); OPPE(op += 48*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_49(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 1)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+30, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_49(ip, op, parm) {\ - BITBLK128V32_49(ip, 0, op, parm); IPPE(ip); OPPE(op += 49*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_50(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 
2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_50(ip, op, parm) {\ - BITBLK128V32_50(ip, 0, op, parm);\ - BITBLK128V32_50(ip, 1, op, parm); IPPE(ip); OPPE(op += 50*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_51(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, 
iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 7)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_51(ip, op, parm) {\ - BITBLK128V32_51(ip, 0, op, parm); IPPE(ip); OPPE(op += 51*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_52(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_52(ip, op, parm) {\ - BITBLK128V32_52(ip, 0, op, parm);\ - 
BITBLK128V32_52(ip, 1, op, parm);\ - BITBLK128V32_52(ip, 2, op, parm);\ - BITBLK128V32_52(ip, 3, op, parm); IPPE(ip); OPPE(op += 52*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_53(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+11, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 3)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_53(ip, op, parm) {\ - BITBLK128V32_53(ip, 0, op, parm); IPPE(ip); OPPE(op += 53*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_54(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ 
- VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 
10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_54(ip, op, parm) {\ - BITBLK128V32_54(ip, 0, op, parm);\ - BITBLK128V32_54(ip, 1, op, parm); IPPE(ip); OPPE(op += 54*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_55(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, 
i*32+10, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, 
iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_55(ip, op, parm) {\ - BITBLK128V32_55(ip, 0, op, parm); IPPE(ip); OPPE(op += 55*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_56(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*4+ 2, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_56(ip, op, parm) {\ - BITBLK128V32_56(ip, 0, op, parm);\ - BITBLK128V32_56(ip, 1, op, parm);\ - BITBLK128V32_56(ip, 2, op, parm);\ - BITBLK128V32_56(ip, 3, op, parm);\ - BITBLK128V32_56(ip, 4, op, parm);\ - BITBLK128V32_56(ip, 5, op, parm);\ - BITBLK128V32_56(ip, 6, op, parm);\ - BITBLK128V32_56(ip, 7, op, parm); IPPE(ip); OPPE(op += 56*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_57(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+20, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_57(ip, op, parm) {\ - BITBLK128V32_57(ip, 0, op, parm); IPPE(ip); 
OPPE(op += 57*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_58(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ 
- VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_58(ip, op, parm) {\ - BITBLK128V32_58(ip, 0, op, parm);\ - BITBLK128V32_58(ip, 1, op, parm); IPPE(ip); OPPE(op += 58*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_59(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = 
IPP(ip, i*32+19, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27); 
_mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_59(ip, op, parm) {\ - BITBLK128V32_59(ip, 0, op, parm); IPPE(ip); OPPE(op += 59*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_60(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_60(ip, op, parm) {\ - BITBLK128V32_60(ip, 0, op, parm);\ - BITBLK128V32_60(ip, 1, op, parm);\ - BITBLK128V32_60(ip, 2, op, parm);\ - BITBLK128V32_60(ip, 3, op, parm); IPPE(ip); OPPE(op += 60*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_61(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 
1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); 
_mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+26, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_61(ip, op, parm) {\ - BITBLK128V32_61(ip, 0, op, parm); IPPE(ip); OPPE(op += 61*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_62(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*16+15, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_62(ip, op, parm) {\ - BITBLK128V32_62(ip, 0, op, parm);\ - BITBLK128V32_62(ip, 1, op, parm); IPPE(ip); OPPE(op += 62*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_63(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = 
_mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 9);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 11);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm_storeu_si128(op++, ov); ov = 
_mm_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 13);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 15);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 17);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 19);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 21);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 23);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm_or_si128(ov, 
_mm_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 25);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 27);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 29);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*32+31, iv), 1)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 31); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_63(ip, op, parm) {\ - BITBLK128V32_63(ip, 0, op, parm); IPPE(ip); OPPE(op += 63*4/sizeof(op[0]));\ -} - -#define BITBLK128V32_64(ip, i, op, parm) { __m128i ov,iv;\ - VSTI(ip, i*1+ 0, iv, parm); ov = _mm_or_si128(ov, _mm_slli_epi32(iv = IPP(ip, i*1+ 0, iv), 0)); _mm_storeu_si128(op++, ov); ov = _mm_srli_epi32(iv, 32); _mm_storeu_si128((__m128i *)op++, ov);\ -} - -#define BITPACK128V32_64(ip, op, parm) {\ - BITBLK128V32_64(ip, 0, op, parm);\ - BITBLK128V32_64(ip, 1, op, parm);\ - BITBLK128V32_64(ip, 2, op, parm);\ - BITBLK128V32_64(ip, 3, op, parm);\ - BITBLK128V32_64(ip, 4, op, parm);\ - BITBLK128V32_64(ip, 5, op, parm);\ - BITBLK128V32_64(ip, 6, op, parm);\ - BITBLK128V32_64(ip, 7, op, parm);\ - BITBLK128V32_64(ip, 8, op, parm);\ - BITBLK128V32_64(ip, 9, op, parm);\ - BITBLK128V32_64(ip, 10, op, parm);\ - BITBLK128V32_64(ip, 11, op, parm);\ - BITBLK128V32_64(ip, 12, 
op, parm);\ - BITBLK128V32_64(ip, 13, op, parm);\ - BITBLK128V32_64(ip, 14, op, parm);\ - BITBLK128V32_64(ip, 15, op, parm);\ - BITBLK128V32_64(ip, 16, op, parm);\ - BITBLK128V32_64(ip, 17, op, parm);\ - BITBLK128V32_64(ip, 18, op, parm);\ - BITBLK128V32_64(ip, 19, op, parm);\ - BITBLK128V32_64(ip, 20, op, parm);\ - BITBLK128V32_64(ip, 21, op, parm);\ - BITBLK128V32_64(ip, 22, op, parm);\ - BITBLK128V32_64(ip, 23, op, parm);\ - BITBLK128V32_64(ip, 24, op, parm);\ - BITBLK128V32_64(ip, 25, op, parm);\ - BITBLK128V32_64(ip, 26, op, parm);\ - BITBLK128V32_64(ip, 27, op, parm);\ - BITBLK128V32_64(ip, 28, op, parm);\ - BITBLK128V32_64(ip, 29, op, parm);\ - BITBLK128V32_64(ip, 30, op, parm);\ - BITBLK128V32_64(ip, 31, op, parm); IPPE(ip); OPPE(op += 64*4/sizeof(op[0]));\ -} - -#define BITPACK128V32(__pip, __nbits, __pop, __parm) { __m128i *__ip=(__m128i *)__pip,*__op=(__m128i *)__pop;\ - switch(__nbits) {\ - case 0: break;\ - case 1:{ BITPACK128V32_1( __ip, __op, __parm); } break;\ - case 2:{ BITPACK128V32_2( __ip, __op, __parm); } break;\ - case 3:{ BITPACK128V32_3( __ip, __op, __parm); } break;\ - case 4:{ BITPACK128V32_4( __ip, __op, __parm); } break;\ - case 5:{ BITPACK128V32_5( __ip, __op, __parm); } break;\ - case 6:{ BITPACK128V32_6( __ip, __op, __parm); } break;\ - case 7:{ BITPACK128V32_7( __ip, __op, __parm); } break;\ - case 8:{ BITPACK128V32_8( __ip, __op, __parm); } break;\ - case 9:{ BITPACK128V32_9( __ip, __op, __parm); } break;\ - case 10:{ BITPACK128V32_10(__ip, __op, __parm); } break;\ - case 11:{ BITPACK128V32_11(__ip, __op, __parm); } break;\ - case 12:{ BITPACK128V32_12(__ip, __op, __parm); } break;\ - case 13:{ BITPACK128V32_13(__ip, __op, __parm); } break;\ - case 14:{ BITPACK128V32_14(__ip, __op, __parm); } break;\ - case 15:{ BITPACK128V32_15(__ip, __op, __parm); } break;\ - case 16:{ BITPACK128V32_16(__ip, __op, __parm); } break;\ - case 17:{ BITPACK128V32_17(__ip, __op, __parm); } break;\ - case 18:{ BITPACK128V32_18(__ip, __op, __parm); } 
break;\ - case 19:{ BITPACK128V32_19(__ip, __op, __parm); } break;\ - case 20:{ BITPACK128V32_20(__ip, __op, __parm); } break;\ - case 21:{ BITPACK128V32_21(__ip, __op, __parm); } break;\ - case 22:{ BITPACK128V32_22(__ip, __op, __parm); } break;\ - case 23:{ BITPACK128V32_23(__ip, __op, __parm); } break;\ - case 24:{ BITPACK128V32_24(__ip, __op, __parm); } break;\ - case 25:{ BITPACK128V32_25(__ip, __op, __parm); } break;\ - case 26:{ BITPACK128V32_26(__ip, __op, __parm); } break;\ - case 27:{ BITPACK128V32_27(__ip, __op, __parm); } break;\ - case 28:{ BITPACK128V32_28(__ip, __op, __parm); } break;\ - case 29:{ BITPACK128V32_29(__ip, __op, __parm); } break;\ - case 30:{ BITPACK128V32_30(__ip, __op, __parm); } break;\ - case 31:{ BITPACK128V32_31(__ip, __op, __parm); } break;\ - case 32:{ BITPACK128V32_32(__ip, __op, __parm); } break;\ - }\ -} - diff --git a/bitpack256v_.h b/bitpack256v_.h deleted file mode 100644 index 595831b..0000000 --- a/bitpack256v_.h +++ /dev/null @@ -1,2036 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// TurboPFor: Integer Compression SIMD bit packing -#define BITBLK256V32_1(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 1));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 3));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 5));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 7));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 9));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 10));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 11));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 13));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 14));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 15));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, 
i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 17));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 18));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 19));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 20));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 21));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 22));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 23));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 24));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 25));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 26));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 27));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 28));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 29));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 30));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 31)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_1(ip, op, parm) {\ - BITBLK256V32_1(ip, 0, op, parm); IPPE(ip); OPPE(op += 1*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_2(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 2));\ - VSTI(ip, i*16+ 2, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 6));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 10));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 14));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 18));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 20));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 22));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 24));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 26));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 28));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 30)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_2(ip, op, parm) {\ - BITBLK256V32_2(ip, 0, op, parm);\ - BITBLK256V32_2(ip, 1, op, parm); IPPE(ip); OPPE(op += 2*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_3(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 3));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 6));\ - VSTI(ip, i*32+ 3, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 9));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 15));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 18));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 21));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 27));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 1));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 7));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 10));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 13));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 19));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 22));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 25));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 28));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 
31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 2));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 5));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 11));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 14));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 17));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 20));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 23));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 26));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 29)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_3(ip, op, parm) {\ - BITBLK256V32_3(ip, 0, op, parm); IPPE(ip); OPPE(op += 3*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_4(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 1, iv), 4));\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 3, iv), 12));\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 5, iv), 20));\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 6, iv), 
24));\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 28)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_4(ip, op, parm) {\ - BITBLK256V32_4(ip, 0, op, parm);\ - BITBLK256V32_4(ip, 1, op, parm);\ - BITBLK256V32_4(ip, 2, op, parm);\ - BITBLK256V32_4(ip, 3, op, parm); IPPE(ip); OPPE(op += 4*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_5(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 5));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 15));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 20));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 25));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 3));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 13));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 18));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 23));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 1));\ - VSTI(ip, 
i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 6));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 11));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 21));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 26));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 9));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 14));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 19));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 24));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 2));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 7));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 12));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 17));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 22));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 27)); 
_mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_5(ip, op, parm) {\ - BITBLK256V32_5(ip, 0, op, parm); IPPE(ip); OPPE(op += 5*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_6(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 6));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 12));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 18));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 24));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 10));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 22));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 2));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 14));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 20));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 26)); 
_mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_6(ip, op, parm) {\ - BITBLK256V32_6(ip, 0, op, parm);\ - BITBLK256V32_6(ip, 1, op, parm); IPPE(ip); OPPE(op += 6*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_7(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 7));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 14));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 21));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 3));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 10));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 17));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 24));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 6));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 13));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 20));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 2));\ - VSTI(ip, i*32+15, iv, 
parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 9));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 23));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 5));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 12));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 19));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 1));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 15));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 22));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 11));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 18));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 25)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define 
BITPACK256V32_7(ip, op, parm) {\ - BITBLK256V32_7(ip, 0, op, parm); IPPE(ip); OPPE(op += 7*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_8(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 1, iv), 8));\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 2, iv), 16));\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 3, iv), 24)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_8(ip, op, parm) {\ - BITBLK256V32_8(ip, 0, op, parm);\ - BITBLK256V32_8(ip, 1, op, parm);\ - BITBLK256V32_8(ip, 2, op, parm);\ - BITBLK256V32_8(ip, 3, op, parm);\ - BITBLK256V32_8(ip, 4, op, parm);\ - BITBLK256V32_8(ip, 5, op, parm);\ - BITBLK256V32_8(ip, 6, op, parm);\ - BITBLK256V32_8(ip, 7, op, parm); IPPE(ip); OPPE(op += 8*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_9(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 9));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 18));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 13));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 22));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 17));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 3));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 21));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 7));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 2));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 11));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 20));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 6));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 15));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, 
ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 1));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 10));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 19));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 5));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 14));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 23)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_9(ip, op, parm) {\ - BITBLK256V32_9(ip, 0, op, parm); IPPE(ip); OPPE(op += 9*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_10(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 10));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 20));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 18));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 6));\ - VSTI(ip, i*16+ 8, 
iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 4));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 14));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 2));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 12));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 22)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_10(ip, op, parm) {\ - BITBLK256V32_10(ip, 0, op, parm);\ - BITBLK256V32_10(ip, 1, op, parm); IPPE(ip); OPPE(op += 10*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_11(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 11));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 1));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( 
IPP(ip, i*32+ 6, iv), 2));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 13));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 3));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 14));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 15));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 5));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 6));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 17));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 7));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, 
i*32+22, iv), 18));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 19));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 9));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 20));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 10));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 21)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_11(ip, op, parm) {\ - BITBLK256V32_11(ip, 0, op, parm); IPPE(ip); OPPE(op += 11*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_12(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 1, iv), 12));\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 3, iv), 4));\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 4, iv), 16));\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*8+ 5, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 6, iv), 8));\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 20)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_12(ip, op, parm) {\ - BITBLK256V32_12(ip, 0, op, parm);\ - BITBLK256V32_12(ip, 1, op, parm);\ - BITBLK256V32_12(ip, 2, op, parm);\ - BITBLK256V32_12(ip, 3, op, parm); IPPE(ip); OPPE(op += 12*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_13(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 13));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 7));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 1));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 14));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 2));\ 
- VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 15));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 9));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 3));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 10));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 17));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 11));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 5));\ - VSTI(ip, i*32+26, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 18));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 12));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 6));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 19)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_13(ip, op, parm) {\ - BITBLK256V32_13(ip, 0, op, parm); IPPE(ip); OPPE(op += 13*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_14(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 1, iv), 14));\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 10));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 6));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 7, iv), 2));\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*16+ 8, iv), 16));\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 12));\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+14, iv), 4));\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 18)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_14(ip, op, parm) {\ - BITBLK256V32_14(ip, 0, op, parm);\ - BITBLK256V32_14(ip, 1, op, parm); IPPE(ip); OPPE(op += 14*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_15(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 1, iv), 15));\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 13));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 11));\ - VSTI(ip, i*32+ 6, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 9));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 7));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 5));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 3));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+15, iv), 1));\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+16, iv), 16));\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 14));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 12));\ - VSTI(ip, 
i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 10));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 6));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+30, iv), 2));\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 17)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_15(ip, op, parm) {\ - BITBLK256V32_15(ip, 0, op, parm); IPPE(ip); OPPE(op += 15*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_16(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*2+ 0, iv, parm); ov = IPP(ip, i*2+ 0, iv);\ - VSTI(ip, i*2+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*2+ 1, iv), 16)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_16(ip, op, parm) {\ - BITBLK256V32_16(ip, 0, op, parm);\ - 
BITBLK256V32_16(ip, 1, op, parm);\ - BITBLK256V32_16(ip, 2, op, parm);\ - BITBLK256V32_16(ip, 3, op, parm);\ - BITBLK256V32_16(ip, 4, op, parm);\ - BITBLK256V32_16(ip, 5, op, parm);\ - BITBLK256V32_16(ip, 6, op, parm);\ - BITBLK256V32_16(ip, 7, op, parm);\ - BITBLK256V32_16(ip, 8, op, parm);\ - BITBLK256V32_16(ip, 9, op, parm);\ - BITBLK256V32_16(ip, 10, op, parm);\ - BITBLK256V32_16(ip, 11, op, parm);\ - BITBLK256V32_16(ip, 12, op, parm);\ - BITBLK256V32_16(ip, 13, op, parm);\ - BITBLK256V32_16(ip, 14, op, parm);\ - BITBLK256V32_16(ip, 15, op, parm); IPPE(ip); OPPE(op += 16*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_17(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 2));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 6));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - 
VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 10));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 12));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 14));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 1));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 3));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 5));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 7));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 9));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 11));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 13));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 15)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_17(ip, op, parm) {\ - BITBLK256V32_17(ip, 0, op, parm); IPPE(ip); OPPE(op += 17*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_18(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 2, iv), 4));\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 4, iv), 8));\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 12));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 2));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+11, iv), 6));\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+13, iv), 10));\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 14)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_18(ip, op, parm) {\ - BITBLK256V32_18(ip, 0, op, parm);\ - BITBLK256V32_18(ip, 1, op, parm); IPPE(ip); OPPE(op += 18*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_19(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 
6));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 12));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 5));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 11));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 10));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 3));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 9));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 2));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 1));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 7));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 13)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_19(ip, op, parm) {\ - BITBLK256V32_19(ip, 0, op, parm); IPPE(ip); OPPE(op += 19*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_20(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 2, iv), 8));\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 5, iv), 4));\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 12)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_20(ip, op, parm) {\ - BITBLK256V32_20(ip, 0, op, parm);\ - BITBLK256V32_20(ip, 1, op, parm);\ - BITBLK256V32_20(ip, 2, op, parm);\ - BITBLK256V32_20(ip, 3, op, parm); IPPE(ip); OPPE(op += 20*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_21(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 2, iv), 10));\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 5, iv), 9));\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 8, iv), 8));\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+11, iv), 7));\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 6));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+15, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 5));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+20, iv), 4));\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+23, iv), 3));\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+26, iv), 2));\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov 
= _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+29, iv), 1));\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 11)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_21(ip, op, parm) {\ - BITBLK256V32_21(ip, 0, op, parm); IPPE(ip); OPPE(op += 21*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_22(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 3, iv), 2));\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 6, iv), 4));\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*16+ 9, iv), 6));\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+12, iv), 8));\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 10)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_22(ip, op, parm) {\ - BITBLK256V32_22(ip, 0, op, parm);\ - BITBLK256V32_22(ip, 1, op, parm); IPPE(ip); OPPE(op += 22*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_23(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 3, iv), 5));\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 7, iv), 1));\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 6));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+14, iv), 2));\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+17, iv), 7));\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 3));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+24, iv), 8));\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+28, iv), 4));\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 9)); 
_mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_23(ip, op, parm) {\ - BITBLK256V32_23(ip, 0, op, parm); IPPE(ip); OPPE(op += 23*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_24(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = IPP(ip, i*4+ 0, iv);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*4+ 3, iv), 8)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_24(ip, op, parm) {\ - BITBLK256V32_24(ip, 0, op, parm);\ - BITBLK256V32_24(ip, 1, op, parm);\ - BITBLK256V32_24(ip, 2, op, parm);\ - BITBLK256V32_24(ip, 3, op, parm);\ - BITBLK256V32_24(ip, 4, op, parm);\ - BITBLK256V32_24(ip, 5, op, parm);\ - BITBLK256V32_24(ip, 6, op, parm);\ - BITBLK256V32_24(ip, 7, op, parm); IPPE(ip); OPPE(op += 24*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_25(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 4, iv), 4));\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 
29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+ 9, iv), 1));\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+13, iv), 5));\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+18, 
iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+18, iv), 2));\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+22, iv), 6));\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+27, iv), 3));\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - 
VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 7)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_25(ip, op, parm) {\ - BITBLK256V32_25(ip, 0, op, parm); IPPE(ip); OPPE(op += 25*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_26(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+ 5, iv), 2));\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+10, iv), 4));\ - VSTI(ip, i*16+11, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 6)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_26(ip, op, parm) {\ - BITBLK256V32_26(ip, 0, op, parm);\ - BITBLK256V32_26(ip, 1, op, parm); IPPE(ip); OPPE(op += 26*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_27(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*32+ 6, iv), 2));\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+12, iv), 4));\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+19, iv), 1));\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+25, iv), 3));\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32( IPP(ip, i*32+31, iv), 5)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_27(ip, op, parm) {\ - BITBLK256V32_27(ip, 0, op, parm); IPPE(ip); OPPE(op += 27*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_28(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = IPP(ip, i*8+ 0, iv);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*8+ 7, iv), 4)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_28(ip, op, parm) {\ - BITBLK256V32_28(ip, 0, op, parm);\ - BITBLK256V32_28(ip, 1, op, parm);\ - BITBLK256V32_28(ip, 2, op, parm);\ - BITBLK256V32_28(ip, 3, op, parm); IPPE(ip); OPPE(op += 28*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_29(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+10, iv), 2));\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
7);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+21, iv), 1));\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, 
i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 3)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_29(ip, op, parm) {\ - BITBLK256V32_29(ip, 0, op, parm); IPPE(ip); OPPE(op += 29*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_30(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = IPP(ip, i*16+ 0, iv);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*16+ 5, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*16+15, iv), 2)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_30(ip, op, parm) {\ - BITBLK256V32_30(ip, 0, op, parm);\ - BITBLK256V32_30(ip, 1, op, parm); IPPE(ip); OPPE(op += 30*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_31(ip, i, op, parm) { __m256i 
ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = IPP(ip, i*32+ 0, iv);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - 
VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32( IPP(ip, i*32+31, iv), 1)); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_31(ip, op, parm) {\ - BITBLK256V32_31(ip, 0, op, parm); IPPE(ip); OPPE(op += 31*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_32(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*1+ 0, iv, parm); ov = IPP(ip, i*1+ 0, iv); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_32(ip, op, parm) {\ - BITBLK256V32_32(ip, 0, op, parm);\ - BITBLK256V32_32(ip, 1, op, parm);\ - BITBLK256V32_32(ip, 2, op, parm);\ - BITBLK256V32_32(ip, 3, op, parm);\ - BITBLK256V32_32(ip, 4, op, parm);\ - BITBLK256V32_32(ip, 5, op, parm);\ - BITBLK256V32_32(ip, 6, op, parm);\ - BITBLK256V32_32(ip, 7, op, parm);\ - BITBLK256V32_32(ip, 8, op, 
parm);\ - BITBLK256V32_32(ip, 9, op, parm);\ - BITBLK256V32_32(ip, 10, op, parm);\ - BITBLK256V32_32(ip, 11, op, parm);\ - BITBLK256V32_32(ip, 12, op, parm);\ - BITBLK256V32_32(ip, 13, op, parm);\ - BITBLK256V32_32(ip, 14, op, parm);\ - BITBLK256V32_32(ip, 15, op, parm);\ - BITBLK256V32_32(ip, 16, op, parm);\ - BITBLK256V32_32(ip, 17, op, parm);\ - BITBLK256V32_32(ip, 18, op, parm);\ - BITBLK256V32_32(ip, 19, op, parm);\ - BITBLK256V32_32(ip, 20, op, parm);\ - BITBLK256V32_32(ip, 21, op, parm);\ - BITBLK256V32_32(ip, 22, op, parm);\ - BITBLK256V32_32(ip, 23, op, parm);\ - BITBLK256V32_32(ip, 24, op, parm);\ - BITBLK256V32_32(ip, 25, op, parm);\ - BITBLK256V32_32(ip, 26, op, parm);\ - BITBLK256V32_32(ip, 27, op, parm);\ - BITBLK256V32_32(ip, 28, op, parm);\ - BITBLK256V32_32(ip, 29, op, parm);\ - BITBLK256V32_32(ip, 30, op, parm);\ - BITBLK256V32_32(ip, 31, op, parm); IPPE(ip); OPPE(op += 32*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_33(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 5)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+17, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_33(ip, op, parm) {\ - BITBLK256V32_33(ip, 0, op, parm); IPPE(ip); OPPE(op += 33*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_34(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, 
iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_34(ip, op, parm) {\ - BITBLK256V32_34(ip, 0, op, parm);\ - BITBLK256V32_34(ip, 1, op, parm); IPPE(ip); OPPE(op += 34*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_35(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 3)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, 
i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_35(ip, op, parm) {\ - BITBLK256V32_35(ip, 0, op, parm); IPPE(ip); OPPE(op += 35*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_36(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*8+ 3, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_36(ip, op, parm) {\ - BITBLK256V32_36(ip, 0, op, parm);\ - BITBLK256V32_36(ip, 1, op, parm);\ - BITBLK256V32_36(ip, 2, op, parm);\ - BITBLK256V32_36(ip, 3, op, parm); IPPE(ip); OPPE(op += 36*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_37(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, 
iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_37(ip, op, parm) {\ - BITBLK256V32_37(ip, 0, op, parm); IPPE(ip); OPPE(op += 37*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_38(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_38(ip, op, parm) {\ - BITBLK256V32_38(ip, 0, op, parm);\ - BITBLK256V32_38(ip, 1, op, parm); IPPE(ip); OPPE(op += 38*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_39(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_39(ip, op, parm) {\ - BITBLK256V32_39(ip, 0, op, parm); IPPE(ip); OPPE(op += 39*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_40(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_40(ip, op, parm) {\ - BITBLK256V32_40(ip, 0, op, parm);\ - BITBLK256V32_40(ip, 1, op, parm);\ - BITBLK256V32_40(ip, 2, op, parm);\ - BITBLK256V32_40(ip, 3, op, parm);\ - BITBLK256V32_40(ip, 4, op, parm);\ - BITBLK256V32_40(ip, 5, op, parm);\ - BITBLK256V32_40(ip, 6, op, parm);\ - BITBLK256V32_40(ip, 7, op, parm); IPPE(ip); OPPE(op += 40*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_41(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*32+ 7, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+19, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_41(ip, op, parm) {\ - BITBLK256V32_41(ip, 0, op, parm); IPPE(ip); OPPE(op += 41*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_42(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_42(ip, op, parm) {\ - BITBLK256V32_42(ip, 0, op, parm);\ - BITBLK256V32_42(ip, 1, op, parm); IPPE(ip); OPPE(op += 42*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_43(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, 
i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, 
ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_43(ip, op, parm) {\ - BITBLK256V32_43(ip, 0, op, parm); IPPE(ip); OPPE(op += 43*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_44(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_44(ip, op, parm) {\ - BITBLK256V32_44(ip, 0, op, parm);\ - BITBLK256V32_44(ip, 1, op, parm);\ - BITBLK256V32_44(ip, 2, op, parm);\ - BITBLK256V32_44(ip, 3, op, parm); IPPE(ip); OPPE(op += 44*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_45(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, 
iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+30, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_45(ip, op, parm) {\ - BITBLK256V32_45(ip, 0, op, parm); IPPE(ip); OPPE(op += 45*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_46(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, 
iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_46(ip, op, parm) {\ - BITBLK256V32_46(ip, 0, op, parm);\ - BITBLK256V32_46(ip, 1, op, parm); IPPE(ip); OPPE(op += 46*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_47(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, 
i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_47(ip, op, parm) {\ - BITBLK256V32_47(ip, 0, op, parm); IPPE(ip); OPPE(op += 47*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_48(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*2+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*2+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*2+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*2+ 1, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_48(ip, op, parm) {\ - BITBLK256V32_48(ip, 0, op, parm);\ - BITBLK256V32_48(ip, 1, op, parm);\ - BITBLK256V32_48(ip, 2, op, parm);\ - BITBLK256V32_48(ip, 3, op, parm);\ - BITBLK256V32_48(ip, 4, op, parm);\ - BITBLK256V32_48(ip, 5, op, parm);\ - BITBLK256V32_48(ip, 6, op, parm);\ - BITBLK256V32_48(ip, 7, op, parm);\ - BITBLK256V32_48(ip, 8, op, 
parm);\ - BITBLK256V32_48(ip, 9, op, parm);\ - BITBLK256V32_48(ip, 10, op, parm);\ - BITBLK256V32_48(ip, 11, op, parm);\ - BITBLK256V32_48(ip, 12, op, parm);\ - BITBLK256V32_48(ip, 13, op, parm);\ - BITBLK256V32_48(ip, 14, op, parm);\ - BITBLK256V32_48(ip, 15, op, parm); IPPE(ip); OPPE(op += 48*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_49(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, 
i*32+ 9, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_49(ip, op, parm) {\ - BITBLK256V32_49(ip, 0, op, parm); IPPE(ip); OPPE(op += 
49*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_50(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+11, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_50(ip, op, parm) {\ - BITBLK256V32_50(ip, 0, op, parm);\ - BITBLK256V32_50(ip, 1, op, parm); IPPE(ip); OPPE(op += 50*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_51(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, 
i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, 
ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_51(ip, op, parm) {\ - BITBLK256V32_51(ip, 0, op, parm); IPPE(ip); OPPE(op += 51*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_52(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 7, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_52(ip, op, parm) {\ - BITBLK256V32_52(ip, 0, op, parm);\ - BITBLK256V32_52(ip, 1, op, parm);\ - BITBLK256V32_52(ip, 2, op, parm);\ - BITBLK256V32_52(ip, 3, op, parm); IPPE(ip); OPPE(op += 52*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_53(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, 
iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - 
-#define BITPACK256V32_53(ip, op, parm) {\ - BITBLK256V32_53(ip, 0, op, parm); IPPE(ip); OPPE(op += 53*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_54(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 28)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_54(ip, op, parm) {\ - BITBLK256V32_54(ip, 0, op, parm);\ - BITBLK256V32_54(ip, 1, op, parm); IPPE(ip); OPPE(op += 54*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_55(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - 
VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_55(ip, op, parm) {\ - BITBLK256V32_55(ip, 0, op, parm); IPPE(ip); OPPE(op += 55*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_56(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*4+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*4+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 1, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*4+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 2, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*4+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*4+ 3, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_56(ip, op, parm) {\ - BITBLK256V32_56(ip, 0, op, parm);\ - BITBLK256V32_56(ip, 1, op, parm);\ - BITBLK256V32_56(ip, 2, op, parm);\ - BITBLK256V32_56(ip, 3, op, parm);\ - BITBLK256V32_56(ip, 4, op, parm);\ - BITBLK256V32_56(ip, 5, op, parm);\ - BITBLK256V32_56(ip, 6, op, parm);\ - BITBLK256V32_56(ip, 7, op, parm); IPPE(ip); OPPE(op += 
56*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_57(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+11, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 6)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_57(ip, op, parm) {\ - BITBLK256V32_57(ip, 0, op, parm); IPPE(ip); OPPE(op += 57*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_58(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 24)); _mm256_storeu_si256(op++, ov); ov = 
_mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_58(ip, op, parm) {\ - BITBLK256V32_58(ip, 0, op, parm);\ - BITBLK256V32_58(ip, 1, op, parm); IPPE(ip); OPPE(op += 58*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_59(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 
30);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 6)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_59(ip, op, parm) {\ - BITBLK256V32_59(ip, 0, op, parm); IPPE(ip); OPPE(op += 59*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_60(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*8+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*8+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 1, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*8+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 2, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*8+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 3, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*8+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 4, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*8+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 5, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*8+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 6, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*8+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*8+ 7, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_60(ip, op, parm) {\ - BITBLK256V32_60(ip, 0, op, parm);\ - 
BITBLK256V32_60(ip, 1, op, parm);\ - BITBLK256V32_60(ip, 2, op, parm);\ - BITBLK256V32_60(ip, 3, op, parm); IPPE(ip); OPPE(op += 60*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_61(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = 
IPP(ip, i*32+10, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31);\ - VSTI(ip, i*32+22, iv, parm); ov 
= _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 3)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_61(ip, op, parm) {\ - BITBLK256V32_61(ip, 0, op, parm); IPPE(ip); OPPE(op += 61*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_62(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*16+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 0, 
iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*16+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 1, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*16+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 2, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*16+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 3, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*16+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 4, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*16+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 5, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*16+ 6, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 6, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*16+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 7, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*16+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 8, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*16+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+ 9, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*16+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+10, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*16+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+11, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*16+12, iv, parm); ov = 
_mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+12, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*16+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+13, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*16+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+14, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*16+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*16+15, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_62(ip, op, parm) {\ - BITBLK256V32_62(ip, 0, op, parm);\ - BITBLK256V32_62(ip, 1, op, parm); IPPE(ip); OPPE(op += 62*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_63(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*32+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32);\ - VSTI(ip, i*32+ 1, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 1, iv), 31)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 1);\ - VSTI(ip, i*32+ 2, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 2, iv), 30)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 2);\ - VSTI(ip, i*32+ 3, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 3, iv), 29)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 3);\ - VSTI(ip, i*32+ 4, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 4, iv), 28)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 4);\ - VSTI(ip, i*32+ 5, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 5, iv), 27)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 5);\ - VSTI(ip, i*32+ 6, iv, parm); ov = _mm256_or_si256(ov, 
_mm256_slli_epi32(iv = IPP(ip, i*32+ 6, iv), 26)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 6);\ - VSTI(ip, i*32+ 7, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 7, iv), 25)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 7);\ - VSTI(ip, i*32+ 8, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 8, iv), 24)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 8);\ - VSTI(ip, i*32+ 9, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+ 9, iv), 23)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 9);\ - VSTI(ip, i*32+10, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+10, iv), 22)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 10);\ - VSTI(ip, i*32+11, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+11, iv), 21)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 11);\ - VSTI(ip, i*32+12, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+12, iv), 20)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 12);\ - VSTI(ip, i*32+13, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+13, iv), 19)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 13);\ - VSTI(ip, i*32+14, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+14, iv), 18)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 14);\ - VSTI(ip, i*32+15, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+15, iv), 17)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 15);\ - VSTI(ip, i*32+16, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+16, iv), 16)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 16);\ - VSTI(ip, i*32+17, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+17, iv), 15)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 17);\ - 
VSTI(ip, i*32+18, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+18, iv), 14)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 18);\ - VSTI(ip, i*32+19, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+19, iv), 13)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 19);\ - VSTI(ip, i*32+20, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+20, iv), 12)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 20);\ - VSTI(ip, i*32+21, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+21, iv), 11)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 21);\ - VSTI(ip, i*32+22, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+22, iv), 10)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 22);\ - VSTI(ip, i*32+23, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+23, iv), 9)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 23);\ - VSTI(ip, i*32+24, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+24, iv), 8)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 24);\ - VSTI(ip, i*32+25, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+25, iv), 7)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 25);\ - VSTI(ip, i*32+26, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+26, iv), 6)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 26);\ - VSTI(ip, i*32+27, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+27, iv), 5)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 27);\ - VSTI(ip, i*32+28, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+28, iv), 4)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 28);\ - VSTI(ip, i*32+29, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+29, iv), 3)); 
_mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 29);\ - VSTI(ip, i*32+30, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+30, iv), 2)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 30);\ - VSTI(ip, i*32+31, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*32+31, iv), 1)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 31); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_63(ip, op, parm) {\ - BITBLK256V32_63(ip, 0, op, parm); IPPE(ip); OPPE(op += 63*4/sizeof(op[0]));\ -} - -#define BITBLK256V32_64(ip, i, op, parm) { __m256i ov,iv;\ - VSTI(ip, i*1+ 0, iv, parm); ov = _mm256_or_si256(ov, _mm256_slli_epi32(iv = IPP(ip, i*1+ 0, iv), 0)); _mm256_storeu_si256(op++, ov); ov = _mm256_srli_epi32(iv, 32); _mm256_storeu_si256((__m128i *)op++, ov);\ -} - -#define BITPACK256V32_64(ip, op, parm) {\ - BITBLK256V32_64(ip, 0, op, parm);\ - BITBLK256V32_64(ip, 1, op, parm);\ - BITBLK256V32_64(ip, 2, op, parm);\ - BITBLK256V32_64(ip, 3, op, parm);\ - BITBLK256V32_64(ip, 4, op, parm);\ - BITBLK256V32_64(ip, 5, op, parm);\ - BITBLK256V32_64(ip, 6, op, parm);\ - BITBLK256V32_64(ip, 7, op, parm);\ - BITBLK256V32_64(ip, 8, op, parm);\ - BITBLK256V32_64(ip, 9, op, parm);\ - BITBLK256V32_64(ip, 10, op, parm);\ - BITBLK256V32_64(ip, 11, op, parm);\ - BITBLK256V32_64(ip, 12, op, parm);\ - BITBLK256V32_64(ip, 13, op, parm);\ - BITBLK256V32_64(ip, 14, op, parm);\ - BITBLK256V32_64(ip, 15, op, parm);\ - BITBLK256V32_64(ip, 16, op, parm);\ - BITBLK256V32_64(ip, 17, op, parm);\ - BITBLK256V32_64(ip, 18, op, parm);\ - BITBLK256V32_64(ip, 19, op, parm);\ - BITBLK256V32_64(ip, 20, op, parm);\ - BITBLK256V32_64(ip, 21, op, parm);\ - BITBLK256V32_64(ip, 22, op, parm);\ - BITBLK256V32_64(ip, 23, op, parm);\ - BITBLK256V32_64(ip, 24, op, parm);\ - BITBLK256V32_64(ip, 25, op, parm);\ - BITBLK256V32_64(ip, 26, op, parm);\ - BITBLK256V32_64(ip, 27, op, parm);\ - BITBLK256V32_64(ip, 28, op, parm);\ - 
BITBLK256V32_64(ip, 29, op, parm);\ - BITBLK256V32_64(ip, 30, op, parm);\ - BITBLK256V32_64(ip, 31, op, parm); IPPE(ip); OPPE(op += 64*4/sizeof(op[0]));\ -} - -#define BITPACK256V32(__pip, __nbits, __pop, __parm) { __m256i *__ip=(__m256i *)__pip,*__op=(__m256i *)__pop;\ - switch(__nbits) {\ - case 0: break;\ - case 1:{ BITPACK256V32_1( __ip, __op, __parm); } break;\ - case 2:{ BITPACK256V32_2( __ip, __op, __parm); } break;\ - case 3:{ BITPACK256V32_3( __ip, __op, __parm); } break;\ - case 4:{ BITPACK256V32_4( __ip, __op, __parm); } break;\ - case 5:{ BITPACK256V32_5( __ip, __op, __parm); } break;\ - case 6:{ BITPACK256V32_6( __ip, __op, __parm); } break;\ - case 7:{ BITPACK256V32_7( __ip, __op, __parm); } break;\ - case 8:{ BITPACK256V32_8( __ip, __op, __parm); } break;\ - case 9:{ BITPACK256V32_9( __ip, __op, __parm); } break;\ - case 10:{ BITPACK256V32_10(__ip, __op, __parm); } break;\ - case 11:{ BITPACK256V32_11(__ip, __op, __parm); } break;\ - case 12:{ BITPACK256V32_12(__ip, __op, __parm); } break;\ - case 13:{ BITPACK256V32_13(__ip, __op, __parm); } break;\ - case 14:{ BITPACK256V32_14(__ip, __op, __parm); } break;\ - case 15:{ BITPACK256V32_15(__ip, __op, __parm); } break;\ - case 16:{ BITPACK256V32_16(__ip, __op, __parm); } break;\ - case 17:{ BITPACK256V32_17(__ip, __op, __parm); } break;\ - case 18:{ BITPACK256V32_18(__ip, __op, __parm); } break;\ - case 19:{ BITPACK256V32_19(__ip, __op, __parm); } break;\ - case 20:{ BITPACK256V32_20(__ip, __op, __parm); } break;\ - case 21:{ BITPACK256V32_21(__ip, __op, __parm); } break;\ - case 22:{ BITPACK256V32_22(__ip, __op, __parm); } break;\ - case 23:{ BITPACK256V32_23(__ip, __op, __parm); } break;\ - case 24:{ BITPACK256V32_24(__ip, __op, __parm); } break;\ - case 25:{ BITPACK256V32_25(__ip, __op, __parm); } break;\ - case 26:{ BITPACK256V32_26(__ip, __op, __parm); } break;\ - case 27:{ BITPACK256V32_27(__ip, __op, __parm); } break;\ - case 28:{ BITPACK256V32_28(__ip, __op, __parm); } break;\ - case 29:{ 
BITPACK256V32_29(__ip, __op, __parm); } break;\ - case 30:{ BITPACK256V32_30(__ip, __op, __parm); } break;\ - case 31:{ BITPACK256V32_31(__ip, __op, __parm); } break;\ - case 32:{ BITPACK256V32_32(__ip, __op, __parm); } break;\ - }\ -} - diff --git a/bitpack64_.h b/bitpack64_.h deleted file mode 100644 index ebfbaa3..0000000 --- a/bitpack64_.h +++ /dev/null @@ -1,2259 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// bitpack include -#define BITBLK32_1(ip, i, op, parm) { ; register uint32_t w;;\ - IPPB(ip, i*32+ 0, parm); w = (uint32_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); w |= (uint32_t)SRC(ip, i*32+ 1) << 1;\ - IPPB(ip, i*32+ 2, parm); w |= (uint32_t)SRC(ip, i*32+ 2) << 2;\ - IPPB(ip, i*32+ 3, parm); w |= (uint32_t)SRC(ip, i*32+ 3) << 3;\ - IPPB(ip, i*32+ 4, parm); w |= (uint32_t)SRC(ip, i*32+ 4) << 4;\ - IPPB(ip, i*32+ 5, parm); w |= (uint32_t)SRC(ip, i*32+ 5) << 5;\ - IPPB(ip, i*32+ 6, parm); w |= (uint32_t)SRC(ip, i*32+ 6) << 6;\ - IPPB(ip, i*32+ 7, parm); w |= (uint32_t)SRC(ip, i*32+ 7) << 7;\ - IPPB(ip, i*32+ 8, parm); w |= (uint32_t)SRC(ip, i*32+ 8) << 8;\ - IPPB(ip, i*32+ 9, parm); w |= (uint32_t)SRC(ip, i*32+ 9) << 9;\ - IPPB(ip, i*32+10, parm); w |= (uint32_t)SRC(ip, i*32+10) << 10;\ - IPPB(ip, i*32+11, parm); w |= (uint32_t)SRC(ip, i*32+11) << 11;\ - IPPB(ip, i*32+12, parm); w |= (uint32_t)SRC(ip, i*32+12) << 12;\ - IPPB(ip, i*32+13, parm); w |= (uint32_t)SRC(ip, i*32+13) << 13;\ - IPPB(ip, i*32+14, parm); w |= (uint32_t)SRC(ip, i*32+14) << 14;\ - IPPB(ip, i*32+15, parm); w |= (uint32_t)SRC(ip, i*32+15) << 15;\ - IPPB(ip, i*32+16, parm); w |= (uint32_t)SRC(ip, i*32+16) << 16;\ - IPPB(ip, i*32+17, parm); w |= (uint32_t)SRC(ip, i*32+17) << 17;\ - IPPB(ip, i*32+18, parm); w |= (uint32_t)SRC(ip, i*32+18) << 18;\ - IPPB(ip, i*32+19, parm); w |= (uint32_t)SRC(ip, i*32+19) << 19;\ - IPPB(ip, i*32+20, parm); w |= (uint32_t)SRC(ip, i*32+20) << 20;\ - IPPB(ip, i*32+21, parm); w |= (uint32_t)SRC(ip, i*32+21) << 21;\ - IPPB(ip, i*32+22, parm); w |= (uint32_t)SRC(ip, i*32+22) << 22;\ - IPPB(ip, i*32+23, parm); w |= (uint32_t)SRC(ip, i*32+23) << 23;\ - IPPB(ip, i*32+24, parm); w |= (uint32_t)SRC(ip, i*32+24) << 24;\ - IPPB(ip, i*32+25, parm); w |= (uint32_t)SRC(ip, i*32+25) << 
25;\ - IPPB(ip, i*32+26, parm); w |= (uint32_t)SRC(ip, i*32+26) << 26;\ - IPPB(ip, i*32+27, parm); w |= (uint32_t)SRC(ip, i*32+27) << 27;\ - IPPB(ip, i*32+28, parm); w |= (uint32_t)SRC(ip, i*32+28) << 28;\ - IPPB(ip, i*32+29, parm); w |= (uint32_t)SRC(ip, i*32+29) << 29;\ - IPPB(ip, i*32+30, parm); w |= (uint32_t)SRC(ip, i*32+30) << 30;\ - IPPB(ip, i*32+31, parm); w |= (uint32_t)SRC(ip, i*32+31) << 31;*((uint32_t *)op+i*1+ 0) = w;;\ -} - -#define BITPACK64_1(ip, op, parm) { \ - BITBLK32_1(ip, 0, op, parm); SRCI(ip); op += 1*4/sizeof(op[0]);\ -} - -#define BITBLK64_2(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*32+ 0, parm); w = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); w |= (uint64_t)SRC(ip, i*32+ 1) << 2;\ - IPPB(ip, i*32+ 2, parm); w |= (uint64_t)SRC(ip, i*32+ 2) << 4;\ - IPPB(ip, i*32+ 3, parm); w |= (uint64_t)SRC(ip, i*32+ 3) << 6;\ - IPPB(ip, i*32+ 4, parm); w |= (uint64_t)SRC(ip, i*32+ 4) << 8;\ - IPPB(ip, i*32+ 5, parm); w |= (uint64_t)SRC(ip, i*32+ 5) << 10;\ - IPPB(ip, i*32+ 6, parm); w |= (uint64_t)SRC(ip, i*32+ 6) << 12;\ - IPPB(ip, i*32+ 7, parm); w |= (uint64_t)SRC(ip, i*32+ 7) << 14;\ - IPPB(ip, i*32+ 8, parm); w |= (uint64_t)SRC(ip, i*32+ 8) << 16;\ - IPPB(ip, i*32+ 9, parm); w |= (uint64_t)SRC(ip, i*32+ 9) << 18;\ - IPPB(ip, i*32+10, parm); w |= (uint64_t)SRC(ip, i*32+10) << 20;\ - IPPB(ip, i*32+11, parm); w |= (uint64_t)SRC(ip, i*32+11) << 22;\ - IPPB(ip, i*32+12, parm); w |= (uint64_t)SRC(ip, i*32+12) << 24;\ - IPPB(ip, i*32+13, parm); w |= (uint64_t)SRC(ip, i*32+13) << 26;\ - IPPB(ip, i*32+14, parm); w |= (uint64_t)SRC(ip, i*32+14) << 28;\ - IPPB(ip, i*32+15, parm); w |= (uint64_t)SRC(ip, i*32+15) << 30;\ - IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ - IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 34;\ - IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 36;\ - IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 38;\ - IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, 
i*32+20) << 40;\ - IPPB(ip, i*32+21, parm); w |= (uint64_t)SRC(ip, i*32+21) << 42;\ - IPPB(ip, i*32+22, parm); w |= (uint64_t)SRC(ip, i*32+22) << 44;\ - IPPB(ip, i*32+23, parm); w |= (uint64_t)SRC(ip, i*32+23) << 46;\ - IPPB(ip, i*32+24, parm); w |= (uint64_t)SRC(ip, i*32+24) << 48;\ - IPPB(ip, i*32+25, parm); w |= (uint64_t)SRC(ip, i*32+25) << 50;\ - IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 52;\ - IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 54;\ - IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 56;\ - IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 58;\ - IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 60;\ - IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 62;*((uint64_t *)op+i*1+ 0) = w;;\ -} - -#define BITPACK64_2(ip, op, parm) { \ - BITBLK64_2(ip, 0, op, parm); SRCI(ip); op += 2*4/sizeof(op[0]);\ -} - -#define BITBLK64_3(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*64+ 0, parm); w = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); w |= (uint64_t)SRC(ip, i*64+ 1) << 3;\ - IPPB(ip, i*64+ 2, parm); w |= (uint64_t)SRC(ip, i*64+ 2) << 6;\ - IPPB(ip, i*64+ 3, parm); w |= (uint64_t)SRC(ip, i*64+ 3) << 9;\ - IPPB(ip, i*64+ 4, parm); w |= (uint64_t)SRC(ip, i*64+ 4) << 12;\ - IPPB(ip, i*64+ 5, parm); w |= (uint64_t)SRC(ip, i*64+ 5) << 15;\ - IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 18;\ - IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 21;\ - IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ - IPPB(ip, i*64+ 9, parm); w |= (uint64_t)SRC(ip, i*64+ 9) << 27;\ - IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 30;\ - IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 33;\ - IPPB(ip, i*64+12, parm); w |= (uint64_t)SRC(ip, i*64+12) << 36;\ - IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 39;\ - IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 42;\ - IPPB(ip, i*64+15, parm); w |= 
(uint64_t)SRC(ip, i*64+15) << 45;\ - IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ - IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 51;\ - IPPB(ip, i*64+18, parm); w |= (uint64_t)SRC(ip, i*64+18) << 54;\ - IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 57;\ - IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 60 | (uint64_t)SRC1(ip, i*64+21) << 63;*((uint64_t *)op+i*3+ 0) = w;\ - IPPB(ip, i*64+21, parm); w = (uint64_t)SRC(ip, i*64+21) >> 1;\ - IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 2;\ - IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 5;\ - IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 8;\ - IPPB(ip, i*64+25, parm); w |= (uint64_t)SRC(ip, i*64+25) << 11;\ - IPPB(ip, i*64+26, parm); w |= (uint64_t)SRC(ip, i*64+26) << 14;\ - IPPB(ip, i*64+27, parm); w |= (uint64_t)SRC(ip, i*64+27) << 17;\ - IPPB(ip, i*64+28, parm); w |= (uint64_t)SRC(ip, i*64+28) << 20;\ - IPPB(ip, i*64+29, parm); w |= (uint64_t)SRC(ip, i*64+29) << 23;\ - IPPB(ip, i*64+30, parm); w |= (uint64_t)SRC(ip, i*64+30) << 26;\ - IPPB(ip, i*64+31, parm); w |= (uint64_t)SRC(ip, i*64+31) << 29;*((uint64_t *)op+i*3+ 1) = w;;\ -} - -#define BITPACK64_3(ip, op, parm) { \ - BITBLK64_3(ip, 0, op, parm); SRCI(ip); op += 3*4/sizeof(op[0]);\ -} - -#define BITBLK64_4(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*16+ 0, parm); w = (uint64_t)SRC(ip, i*16+ 0) ;\ - IPPB(ip, i*16+ 1, parm); w |= (uint64_t)SRC(ip, i*16+ 1) << 4;\ - IPPB(ip, i*16+ 2, parm); w |= (uint64_t)SRC(ip, i*16+ 2) << 8;\ - IPPB(ip, i*16+ 3, parm); w |= (uint64_t)SRC(ip, i*16+ 3) << 12;\ - IPPB(ip, i*16+ 4, parm); w |= (uint64_t)SRC(ip, i*16+ 4) << 16;\ - IPPB(ip, i*16+ 5, parm); w |= (uint64_t)SRC(ip, i*16+ 5) << 20;\ - IPPB(ip, i*16+ 6, parm); w |= (uint64_t)SRC(ip, i*16+ 6) << 24;\ - IPPB(ip, i*16+ 7, parm); w |= (uint64_t)SRC(ip, i*16+ 7) << 28;\ - IPPB(ip, i*16+ 8, parm); w |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ - IPPB(ip, i*16+ 9, parm); 
w |= (uint64_t)SRC(ip, i*16+ 9) << 36;\ - IPPB(ip, i*16+10, parm); w |= (uint64_t)SRC(ip, i*16+10) << 40;\ - IPPB(ip, i*16+11, parm); w |= (uint64_t)SRC(ip, i*16+11) << 44;\ - IPPB(ip, i*16+12, parm); w |= (uint64_t)SRC(ip, i*16+12) << 48;\ - IPPB(ip, i*16+13, parm); w |= (uint64_t)SRC(ip, i*16+13) << 52;\ - IPPB(ip, i*16+14, parm); w |= (uint64_t)SRC(ip, i*16+14) << 56;\ - IPPB(ip, i*16+15, parm); w |= (uint64_t)SRC(ip, i*16+15) << 60;*((uint64_t *)op+i*1+ 0) = w;;\ -} - -#define BITPACK64_4(ip, op, parm) { \ - BITBLK64_4(ip, 0, op, parm);\ - BITBLK64_4(ip, 1, op, parm); SRCI(ip); op += 4*4/sizeof(op[0]);\ -} - -#define BITBLK64_5(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*64+ 0, parm); w = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); w |= (uint64_t)SRC(ip, i*64+ 1) << 5;\ - IPPB(ip, i*64+ 2, parm); w |= (uint64_t)SRC(ip, i*64+ 2) << 10;\ - IPPB(ip, i*64+ 3, parm); w |= (uint64_t)SRC(ip, i*64+ 3) << 15;\ - IPPB(ip, i*64+ 4, parm); w |= (uint64_t)SRC(ip, i*64+ 4) << 20;\ - IPPB(ip, i*64+ 5, parm); w |= (uint64_t)SRC(ip, i*64+ 5) << 25;\ - IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 30;\ - IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 35;\ - IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 40;\ - IPPB(ip, i*64+ 9, parm); w |= (uint64_t)SRC(ip, i*64+ 9) << 45;\ - IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 50;\ - IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 55 | (uint64_t)SRC1(ip, i*64+12) << 60;*((uint64_t *)op+i*5+ 0) = w;\ - IPPB(ip, i*64+12, parm); w = (uint64_t)SRC(ip, i*64+12) >> 4;\ - IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 1;\ - IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 6;\ - IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 11;\ - IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 16;\ - IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 21;\ - IPPB(ip, i*64+18, parm); w |= (uint64_t)SRC(ip, 
i*64+18) << 26;\ - IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 31;\ - IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 36;\ - IPPB(ip, i*64+21, parm); w |= (uint64_t)SRC(ip, i*64+21) << 41;\ - IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 46;\ - IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 51;\ - IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 56 | (uint64_t)SRC1(ip, i*64+25) << 61;*((uint64_t *)op+i*5+ 1) = w;\ - IPPB(ip, i*64+25, parm); w = (uint64_t)SRC(ip, i*64+25) >> 3;\ - IPPB(ip, i*64+26, parm); w |= (uint64_t)SRC(ip, i*64+26) << 2;\ - IPPB(ip, i*64+27, parm); w |= (uint64_t)SRC(ip, i*64+27) << 7;\ - IPPB(ip, i*64+28, parm); w |= (uint64_t)SRC(ip, i*64+28) << 12;\ - IPPB(ip, i*64+29, parm); w |= (uint64_t)SRC(ip, i*64+29) << 17;\ - IPPB(ip, i*64+30, parm); w |= (uint64_t)SRC(ip, i*64+30) << 22;\ - IPPB(ip, i*64+31, parm); w |= (uint64_t)SRC(ip, i*64+31) << 27;*((uint64_t *)op+i*5+ 2) = w;;\ -} - -#define BITPACK64_5(ip, op, parm) { \ - BITBLK64_5(ip, 0, op, parm); SRCI(ip); op += 5*4/sizeof(op[0]);\ -} - -#define BITBLK64_6(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*32+ 0, parm); w = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); w |= (uint64_t)SRC(ip, i*32+ 1) << 6;\ - IPPB(ip, i*32+ 2, parm); w |= (uint64_t)SRC(ip, i*32+ 2) << 12;\ - IPPB(ip, i*32+ 3, parm); w |= (uint64_t)SRC(ip, i*32+ 3) << 18;\ - IPPB(ip, i*32+ 4, parm); w |= (uint64_t)SRC(ip, i*32+ 4) << 24;\ - IPPB(ip, i*32+ 5, parm); w |= (uint64_t)SRC(ip, i*32+ 5) << 30;\ - IPPB(ip, i*32+ 6, parm); w |= (uint64_t)SRC(ip, i*32+ 6) << 36;\ - IPPB(ip, i*32+ 7, parm); w |= (uint64_t)SRC(ip, i*32+ 7) << 42;\ - IPPB(ip, i*32+ 8, parm); w |= (uint64_t)SRC(ip, i*32+ 8) << 48;\ - IPPB(ip, i*32+ 9, parm); w |= (uint64_t)SRC(ip, i*32+ 9) << 54 | (uint64_t)SRC1(ip, i*32+10) << 60;*((uint64_t *)op+i*3+ 0) = w;\ - IPPB(ip, i*32+10, parm); w = (uint64_t)SRC(ip, i*32+10) >> 4;\ - IPPB(ip, i*32+11, parm); w |= 
(uint64_t)SRC(ip, i*32+11) << 2;\ - IPPB(ip, i*32+12, parm); w |= (uint64_t)SRC(ip, i*32+12) << 8;\ - IPPB(ip, i*32+13, parm); w |= (uint64_t)SRC(ip, i*32+13) << 14;\ - IPPB(ip, i*32+14, parm); w |= (uint64_t)SRC(ip, i*32+14) << 20;\ - IPPB(ip, i*32+15, parm); w |= (uint64_t)SRC(ip, i*32+15) << 26;\ - IPPB(ip, i*32+16, parm); w |= (uint64_t)SRC(ip, i*32+16) << 32;\ - IPPB(ip, i*32+17, parm); w |= (uint64_t)SRC(ip, i*32+17) << 38;\ - IPPB(ip, i*32+18, parm); w |= (uint64_t)SRC(ip, i*32+18) << 44;\ - IPPB(ip, i*32+19, parm); w |= (uint64_t)SRC(ip, i*32+19) << 50;\ - IPPB(ip, i*32+20, parm); w |= (uint64_t)SRC(ip, i*32+20) << 56 | (uint64_t)SRC1(ip, i*32+21) << 62;*((uint64_t *)op+i*3+ 1) = w;\ - IPPB(ip, i*32+21, parm); w = (uint64_t)SRC(ip, i*32+21) >> 2;\ - IPPB(ip, i*32+22, parm); w |= (uint64_t)SRC(ip, i*32+22) << 4;\ - IPPB(ip, i*32+23, parm); w |= (uint64_t)SRC(ip, i*32+23) << 10;\ - IPPB(ip, i*32+24, parm); w |= (uint64_t)SRC(ip, i*32+24) << 16;\ - IPPB(ip, i*32+25, parm); w |= (uint64_t)SRC(ip, i*32+25) << 22;\ - IPPB(ip, i*32+26, parm); w |= (uint64_t)SRC(ip, i*32+26) << 28;\ - IPPB(ip, i*32+27, parm); w |= (uint64_t)SRC(ip, i*32+27) << 34;\ - IPPB(ip, i*32+28, parm); w |= (uint64_t)SRC(ip, i*32+28) << 40;\ - IPPB(ip, i*32+29, parm); w |= (uint64_t)SRC(ip, i*32+29) << 46;\ - IPPB(ip, i*32+30, parm); w |= (uint64_t)SRC(ip, i*32+30) << 52;\ - IPPB(ip, i*32+31, parm); w |= (uint64_t)SRC(ip, i*32+31) << 58;*((uint64_t *)op+i*3+ 2) = w;;\ -} - -#define BITPACK64_6(ip, op, parm) { \ - BITBLK64_6(ip, 0, op, parm); SRCI(ip); op += 6*4/sizeof(op[0]);\ -} - -#define BITBLK64_7(ip, i, op, parm) { ; register uint64_t w;;\ - IPPB(ip, i*64+ 0, parm); w = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); w |= (uint64_t)SRC(ip, i*64+ 1) << 7;\ - IPPB(ip, i*64+ 2, parm); w |= (uint64_t)SRC(ip, i*64+ 2) << 14;\ - IPPB(ip, i*64+ 3, parm); w |= (uint64_t)SRC(ip, i*64+ 3) << 21;\ - IPPB(ip, i*64+ 4, parm); w |= (uint64_t)SRC(ip, i*64+ 4) << 28;\ - IPPB(ip, i*64+ 5, parm); 
w |= (uint64_t)SRC(ip, i*64+ 5) << 35;\ - IPPB(ip, i*64+ 6, parm); w |= (uint64_t)SRC(ip, i*64+ 6) << 42;\ - IPPB(ip, i*64+ 7, parm); w |= (uint64_t)SRC(ip, i*64+ 7) << 49;\ - IPPB(ip, i*64+ 8, parm); w |= (uint64_t)SRC(ip, i*64+ 8) << 56 | (uint64_t)SRC1(ip, i*64+9) << 63;*((uint64_t *)op+i*7+ 0) = w;\ - IPPB(ip, i*64+ 9, parm); w = (uint64_t)SRC(ip, i*64+ 9) >> 1;\ - IPPB(ip, i*64+10, parm); w |= (uint64_t)SRC(ip, i*64+10) << 6;\ - IPPB(ip, i*64+11, parm); w |= (uint64_t)SRC(ip, i*64+11) << 13;\ - IPPB(ip, i*64+12, parm); w |= (uint64_t)SRC(ip, i*64+12) << 20;\ - IPPB(ip, i*64+13, parm); w |= (uint64_t)SRC(ip, i*64+13) << 27;\ - IPPB(ip, i*64+14, parm); w |= (uint64_t)SRC(ip, i*64+14) << 34;\ - IPPB(ip, i*64+15, parm); w |= (uint64_t)SRC(ip, i*64+15) << 41;\ - IPPB(ip, i*64+16, parm); w |= (uint64_t)SRC(ip, i*64+16) << 48;\ - IPPB(ip, i*64+17, parm); w |= (uint64_t)SRC(ip, i*64+17) << 55 | (uint64_t)SRC1(ip, i*64+18) << 62;*((uint64_t *)op+i*7+ 1) = w;\ - IPPB(ip, i*64+18, parm); w = (uint64_t)SRC(ip, i*64+18) >> 2;\ - IPPB(ip, i*64+19, parm); w |= (uint64_t)SRC(ip, i*64+19) << 5;\ - IPPB(ip, i*64+20, parm); w |= (uint64_t)SRC(ip, i*64+20) << 12;\ - IPPB(ip, i*64+21, parm); w |= (uint64_t)SRC(ip, i*64+21) << 19;\ - IPPB(ip, i*64+22, parm); w |= (uint64_t)SRC(ip, i*64+22) << 26;\ - IPPB(ip, i*64+23, parm); w |= (uint64_t)SRC(ip, i*64+23) << 33;\ - IPPB(ip, i*64+24, parm); w |= (uint64_t)SRC(ip, i*64+24) << 40;\ - IPPB(ip, i*64+25, parm); w |= (uint64_t)SRC(ip, i*64+25) << 47;\ - IPPB(ip, i*64+26, parm); w |= (uint64_t)SRC(ip, i*64+26) << 54 | (uint64_t)SRC1(ip, i*64+27) << 61;*((uint64_t *)op+i*7+ 2) = w;\ - IPPB(ip, i*64+27, parm); w = (uint64_t)SRC(ip, i*64+27) >> 3;\ - IPPB(ip, i*64+28, parm); w |= (uint64_t)SRC(ip, i*64+28) << 4;\ - IPPB(ip, i*64+29, parm); w |= (uint64_t)SRC(ip, i*64+29) << 11;\ - IPPB(ip, i*64+30, parm); w |= (uint64_t)SRC(ip, i*64+30) << 18;\ - IPPB(ip, i*64+31, parm); w |= (uint64_t)SRC(ip, i*64+31) << 25;*((uint64_t *)op+i*7+ 3) = w;;\ -} 
- -#define BITPACK64_7(ip, op, parm) { \ - BITBLK64_7(ip, 0, op, parm); SRCI(ip); op += 7*4/sizeof(op[0]);\ -} - -#define BITBLK64_8(ip, i, op, parm) { ;\ - IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*1+ 0) = (uint64_t)SRC(ip, i*8+ 0) ;\ - IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 1) << 8;\ - IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 2) << 16;\ - IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 3) << 24;\ - IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 4) << 32;\ - IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 5) << 40;\ - IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 6) << 48;\ - IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*1+ 0) |= (uint64_t)SRC(ip, i*8+ 7) << 56;\ -} - -#define BITPACK64_8(ip, op, parm) { \ - BITBLK64_8(ip, 0, op, parm);\ - BITBLK64_8(ip, 1, op, parm);\ - BITBLK64_8(ip, 2, op, parm);\ - BITBLK64_8(ip, 3, op, parm); SRCI(ip); op += 8*4/sizeof(op[0]);\ -} - -#define BITBLK64_9(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 9;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 18;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 27;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 36;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 5) << 45;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*64+ 6) << 54 | (uint64_t)SRC1(ip, i*64+7) << 63;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*9+ 1) = (uint64_t)SRC(ip, i*64+ 7) >> 1;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*9+ 1) |= 
(uint64_t)SRC(ip, i*64+ 9) << 17;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+10) << 26;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+11) << 35;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+12) << 44;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*64+13) << 53 | (uint64_t)SRC1(ip, i*64+14) << 62;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*9+ 2) = (uint64_t)SRC(ip, i*64+14) >> 2;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+15) << 7;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+16) << 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+17) << 25;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+18) << 34;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+19) << 43;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*64+20) << 52 | (uint64_t)SRC1(ip, i*64+21) << 61;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*9+ 3) = (uint64_t)SRC(ip, i*64+21) >> 3;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+22) << 6;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+23) << 15;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+24) << 24;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+25) << 33;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+26) << 42;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*64+27) << 51 | (uint64_t)SRC1(ip, i*64+28) << 60;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*9+ 4) = (uint64_t)SRC(ip, i*64+28) >> 4;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*64+29) << 5;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*9+ 4) |= 
(uint64_t)SRC(ip, i*64+30) << 14;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*64+31) << 23;\ -} - -#define BITPACK64_9(ip, op, parm) { \ - BITBLK64_9(ip, 0, op, parm); SRCI(ip); op += 9*4/sizeof(op[0]);\ -} - -#define BITBLK64_10(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 10;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 20;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 3) << 30;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 4) << 40;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*32+ 5) << 50 | (uint64_t)SRC1(ip, i*32+6) << 60;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*5+ 1) = (uint64_t)SRC(ip, i*32+ 6) >> 4;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+ 7) << 6;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+ 8) << 16;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+ 9) << 26;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+10) << 36;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*32+11) << 46 | (uint64_t)SRC1(ip, i*32+12) << 56;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*5+ 2) = (uint64_t)SRC(ip, i*32+12) >> 8;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+13) << 2;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+14) << 12;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+15) << 22;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+16) << 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+17) << 42;\ - IPPB(ip, i*32+18, parm); 
*((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*32+18) << 52 | (uint64_t)SRC1(ip, i*32+19) << 62;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*5+ 3) = (uint64_t)SRC(ip, i*32+19) >> 2;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+20) << 8;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+21) << 18;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+22) << 28;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+23) << 38;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*32+24) << 48 | (uint64_t)SRC1(ip, i*32+25) << 58;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*5+ 4) = (uint64_t)SRC(ip, i*32+25) >> 6;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+26) << 4;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+27) << 14;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+28) << 24;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+29) << 34;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+30) << 44;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*32+31) << 54;\ -} - -#define BITPACK64_10(ip, op, parm) { \ - BITBLK64_10(ip, 0, op, parm); SRCI(ip); op += 10*4/sizeof(op[0]);\ -} - -#define BITBLK64_11(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 11;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 22;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 33;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*64+ 4) << 44 | (uint64_t)SRC1(ip, i*64+5) << 55;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*11+ 1) = 
(uint64_t)SRC(ip, i*64+ 5) >> 9;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 2;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 13;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+ 9) << 35;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*64+10) << 46 | (uint64_t)SRC1(ip, i*64+11) << 57;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*11+ 2) = (uint64_t)SRC(ip, i*64+11) >> 7;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+12) << 4;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+13) << 15;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+14) << 26;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+15) << 37;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 59;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*11+ 3) = (uint64_t)SRC(ip, i*64+17) >> 5;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+18) << 6;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+19) << 17;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+20) << 28;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+21) << 39;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*64+22) << 50 | (uint64_t)SRC1(ip, i*64+23) << 61;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*11+ 4) = (uint64_t)SRC(ip, i*64+23) >> 3;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+24) << 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+25) << 19;\ - IPPB(ip, i*64+26, parm); *((uint64_t 
*)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+26) << 30;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+27) << 41;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*64+28) << 52 | (uint64_t)SRC1(ip, i*64+29) << 63;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*11+ 5) = (uint64_t)SRC(ip, i*64+29) >> 1;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*64+30) << 10;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*64+31) << 21;\ -} - -#define BITPACK64_11(ip, op, parm) { \ - BITBLK64_11(ip, 0, op, parm); SRCI(ip); op += 11*4/sizeof(op[0]);\ -} - -#define BITBLK64_12(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint64_t)SRC(ip, i*16+ 0) ;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 12;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 24;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 3) << 36;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*16+ 4) << 48 | (uint64_t)SRC1(ip, i*16+5) << 60;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*3+ 1) = (uint64_t)SRC(ip, i*16+ 5) >> 4;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 6) << 8;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 7) << 20;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 8) << 32;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*16+ 9) << 44 | (uint64_t)SRC1(ip, i*16+10) << 56;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*3+ 2) = (uint64_t)SRC(ip, i*16+10) >> 8;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+11) << 4;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+12) << 16;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*3+ 2) |= 
(uint64_t)SRC(ip, i*16+13) << 28;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+14) << 40;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*16+15) << 52;\ -} - -#define BITPACK64_12(ip, op, parm) { \ - BITBLK64_12(ip, 0, op, parm);\ - BITBLK64_12(ip, 1, op, parm); SRCI(ip); op += 12*4/sizeof(op[0]);\ -} - -#define BITBLK64_13(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 13;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 26;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 39 | (uint64_t)SRC1(ip, i*64+4) << 52;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*13+ 1) = (uint64_t)SRC(ip, i*64+ 4) >> 12;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 1;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 14;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 27;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 53;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*13+ 2) = (uint64_t)SRC(ip, i*64+ 9) >> 11;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+10) << 2;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+11) << 15;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+12) << 28;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*64+13) << 41 | (uint64_t)SRC1(ip, i*64+14) << 54;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*13+ 3) = (uint64_t)SRC(ip, i*64+14) >> 10;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+15) << 3;\ - IPPB(ip, i*64+16, parm); *((uint64_t 
*)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+16) << 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+17) << 29;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*64+18) << 42 | (uint64_t)SRC1(ip, i*64+19) << 55;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*13+ 4) = (uint64_t)SRC(ip, i*64+19) >> 9;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+20) << 4;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+21) << 17;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+22) << 30;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*64+23) << 43 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*13+ 5) = (uint64_t)SRC(ip, i*64+24) >> 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+25) << 5;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+26) << 18;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+27) << 31;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*64+28) << 44 | (uint64_t)SRC1(ip, i*64+29) << 57;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*13+ 6) = (uint64_t)SRC(ip, i*64+29) >> 7;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*64+30) << 6;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*64+31) << 19;\ -} - -#define BITPACK64_13(ip, op, parm) { \ - BITBLK64_13(ip, 0, op, parm); SRCI(ip); op += 13*4/sizeof(op[0]);\ -} - -#define BITBLK64_14(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 14;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 28;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*7+ 0) |= 
(uint64_t)SRC(ip, i*32+ 3) << 42 | (uint64_t)SRC1(ip, i*32+4) << 56;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*7+ 1) = (uint64_t)SRC(ip, i*32+ 4) >> 8;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 5) << 6;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 20;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 7) << 34;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*32+ 8) << 48 | (uint64_t)SRC1(ip, i*32+9) << 62;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*7+ 2) = (uint64_t)SRC(ip, i*32+ 9) >> 2;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+10) << 12;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+11) << 26;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*32+12) << 40 | (uint64_t)SRC1(ip, i*32+13) << 54;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*7+ 3) = (uint64_t)SRC(ip, i*32+13) >> 10;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+14) << 4;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+15) << 18;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+16) << 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*32+17) << 46 | (uint64_t)SRC1(ip, i*32+18) << 60;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*7+ 4) = (uint64_t)SRC(ip, i*32+18) >> 4;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+19) << 10;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+20) << 24;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*32+21) << 38 | (uint64_t)SRC1(ip, i*32+22) << 52;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*7+ 5) = (uint64_t)SRC(ip, i*32+22) >> 12;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+23) << 
2;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+24) << 16;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+25) << 30;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*32+26) << 44 | (uint64_t)SRC1(ip, i*32+27) << 58;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*7+ 6) = (uint64_t)SRC(ip, i*32+27) >> 6;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+28) << 8;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+29) << 22;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+30) << 36;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*32+31) << 50;\ -} - -#define BITPACK64_14(ip, op, parm) { \ - BITBLK64_14(ip, 0, op, parm); SRCI(ip); op += 14*4/sizeof(op[0]);\ -} - -#define BITBLK64_15(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 15;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 30;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*64+ 3) << 45 | (uint64_t)SRC1(ip, i*64+4) << 60;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*15+ 1) = (uint64_t)SRC(ip, i*64+ 4) >> 4;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 11;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 26;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*64+ 7) << 41 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*15+ 2) = (uint64_t)SRC(ip, i*64+ 8) >> 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 7;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+10) << 22;\ - IPPB(ip, i*64+11, 
parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*64+11) << 37 | (uint64_t)SRC1(ip, i*64+12) << 52;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*15+ 3) = (uint64_t)SRC(ip, i*64+12) >> 12;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+13) << 3;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+14) << 18;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+15) << 33;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*64+16) << 48 | (uint64_t)SRC1(ip, i*64+17) << 63;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*15+ 4) = (uint64_t)SRC(ip, i*64+17) >> 1;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+18) << 14;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+19) << 29;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*64+20) << 44 | (uint64_t)SRC1(ip, i*64+21) << 59;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*15+ 5) = (uint64_t)SRC(ip, i*64+21) >> 5;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+22) << 10;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+23) << 25;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 55;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*15+ 6) = (uint64_t)SRC(ip, i*64+25) >> 9;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+26) << 6;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+27) << 21;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*64+28) << 36 | (uint64_t)SRC1(ip, i*64+29) << 51;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*15+ 7) = (uint64_t)SRC(ip, i*64+29) >> 13;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*64+30) << 2;\ - IPPB(ip, i*64+31, parm); 
*((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*64+31) << 17;\ -} - -#define BITPACK64_15(ip, op, parm) { \ - BITBLK64_15(ip, 0, op, parm); SRCI(ip); op += 15*4/sizeof(op[0]);\ -} - -#define BITBLK64_16(ip, i, op, parm) { \ - IPPB(ip, i*4+ 0, parm); *(uint16_t *)(op+i*8+ 0) = SRC(ip, i*4+ 0);\ - IPPB(ip, i*4+ 1, parm); *(uint16_t *)(op+i*8+ 2) = SRC(ip, i*4+ 1);\ - IPPB(ip, i*4+ 2, parm); *(uint16_t *)(op+i*8+ 4) = SRC(ip, i*4+ 2);\ - IPPB(ip, i*4+ 3, parm); *(uint16_t *)(op+i*8+ 6) = SRC(ip, i*4+ 3);;\ -} - -#define BITPACK64_16(ip, op, parm) { \ - BITBLK64_16(ip, 0, op, parm);\ - BITBLK64_16(ip, 1, op, parm);\ - BITBLK64_16(ip, 2, op, parm);\ - BITBLK64_16(ip, 3, op, parm);\ - BITBLK64_16(ip, 4, op, parm);\ - BITBLK64_16(ip, 5, op, parm);\ - BITBLK64_16(ip, 6, op, parm);\ - BITBLK64_16(ip, 7, op, parm); SRCI(ip); op += 16*4/sizeof(op[0]);\ -} - -#define BITBLK64_17(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*17+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*17+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 17;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*17+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 34 | (uint64_t)SRC1(ip, i*64+3) << 51;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*17+ 1) = (uint64_t)SRC(ip, i*64+ 3) >> 13;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 4;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 21;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*64+ 6) << 38 | (uint64_t)SRC1(ip, i*64+7) << 55;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*17+ 2) = (uint64_t)SRC(ip, i*64+ 7) >> 9;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 25;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*64+10) << 42 | (uint64_t)SRC1(ip, i*64+11) << 59;\ 
- IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*17+ 3) = (uint64_t)SRC(ip, i*64+11) >> 5;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+12) << 12;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+13) << 29;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*64+14) << 46 | (uint64_t)SRC1(ip, i*64+15) << 63;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*17+ 4) = (uint64_t)SRC(ip, i*64+15) >> 1;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+16) << 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*64+17) << 33 | (uint64_t)SRC1(ip, i*64+18) << 50;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*17+ 5) = (uint64_t)SRC(ip, i*64+18) >> 14;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+19) << 3;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+20) << 20;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*64+21) << 37 | (uint64_t)SRC1(ip, i*64+22) << 54;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*17+ 6) = (uint64_t)SRC(ip, i*64+22) >> 10;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+23) << 7;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+24) << 24;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*64+25) << 41 | (uint64_t)SRC1(ip, i*64+26) << 58;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*17+ 7) = (uint64_t)SRC(ip, i*64+26) >> 6;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+27) << 11;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+28) << 28;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*64+29) << 45 | (uint64_t)SRC1(ip, i*64+30) << 62;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*17+ 8) = (uint64_t)SRC(ip, i*64+30) >> 2;\ - IPPB(ip, 
i*64+31, parm); *((uint64_t *)op+i*17+ 8) |= (uint64_t)SRC(ip, i*64+31) << 15;\ -} - -#define BITPACK64_17(ip, op, parm) { \ - BITBLK64_17(ip, 0, op, parm); SRCI(ip); op += 17*4/sizeof(op[0]);\ -} - -#define BITBLK64_18(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 18;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*9+ 0) |= (uint64_t)SRC(ip, i*32+ 2) << 36 | (uint64_t)SRC1(ip, i*32+3) << 54;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*9+ 1) = (uint64_t)SRC(ip, i*32+ 3) >> 10;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 4) << 8;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 5) << 26;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*32+ 6) << 44 | (uint64_t)SRC1(ip, i*32+7) << 62;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*9+ 2) = (uint64_t)SRC(ip, i*32+ 7) >> 2;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 8) << 16;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*32+ 9) << 34 | (uint64_t)SRC1(ip, i*32+10) << 52;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*9+ 3) = (uint64_t)SRC(ip, i*32+10) >> 12;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+11) << 6;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+12) << 24;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*32+13) << 42 | (uint64_t)SRC1(ip, i*32+14) << 60;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*9+ 4) = (uint64_t)SRC(ip, i*32+14) >> 4;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*32+15) << 14;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*9+ 4) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 50;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*9+ 5) = 
(uint64_t)SRC(ip, i*32+17) >> 14;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+18) << 4;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+19) << 22;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*32+20) << 40 | (uint64_t)SRC1(ip, i*32+21) << 58;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*9+ 6) = (uint64_t)SRC(ip, i*32+21) >> 6;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*9+ 6) |= (uint64_t)SRC(ip, i*32+22) << 12;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*9+ 6) |= (uint64_t)SRC(ip, i*32+23) << 30 | (uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*9+ 7) = (uint64_t)SRC(ip, i*32+24) >> 16;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+25) << 2;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+26) << 20;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*32+27) << 38 | (uint64_t)SRC1(ip, i*32+28) << 56;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*9+ 8) = (uint64_t)SRC(ip, i*32+28) >> 8;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+29) << 10;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+30) << 28;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*32+31) << 46;\ -} - -#define BITPACK64_18(ip, op, parm) { \ - BITBLK64_18(ip, 0, op, parm); SRCI(ip); op += 18*4/sizeof(op[0]);\ -} - -#define BITBLK64_19(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*19+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 19;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*19+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 38 | (uint64_t)SRC1(ip, i*64+3) << 57;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*19+ 1) = (uint64_t)SRC(ip, i*64+ 3) >> 7;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*19+ 
1) |= (uint64_t)SRC(ip, i*64+ 4) << 12;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*19+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 31 | (uint64_t)SRC1(ip, i*64+6) << 50;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*19+ 2) = (uint64_t)SRC(ip, i*64+ 6) >> 14;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 5;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 24;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*64+ 9) << 43 | (uint64_t)SRC1(ip, i*64+10) << 62;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*19+ 3) = (uint64_t)SRC(ip, i*64+10) >> 2;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+11) << 17;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*19+ 3) |= (uint64_t)SRC(ip, i*64+12) << 36 | (uint64_t)SRC1(ip, i*64+13) << 55;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*19+ 4) = (uint64_t)SRC(ip, i*64+13) >> 9;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*19+ 4) |= (uint64_t)SRC(ip, i*64+14) << 10;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*19+ 4) |= (uint64_t)SRC(ip, i*64+15) << 29 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*19+ 5) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+17) << 3;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+18) << 22;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*64+19) << 41 | (uint64_t)SRC1(ip, i*64+20) << 60;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*19+ 6) = (uint64_t)SRC(ip, i*64+20) >> 4;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+21) << 15;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*19+ 6) |= (uint64_t)SRC(ip, i*64+22) << 34 | (uint64_t)SRC1(ip, i*64+23) << 53;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*19+ 7) = (uint64_t)SRC(ip, i*64+23) >> 11;\ - IPPB(ip, i*64+24, 
parm); *((uint64_t *)op+i*19+ 7) |= (uint64_t)SRC(ip, i*64+24) << 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*19+ 7) |= (uint64_t)SRC(ip, i*64+25) << 27 | (uint64_t)SRC1(ip, i*64+26) << 46;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*19+ 8) = (uint64_t)SRC(ip, i*64+26) >> 18;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+27) << 1;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+28) << 20;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*64+29) << 39 | (uint64_t)SRC1(ip, i*64+30) << 58;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*19+ 9) = (uint64_t)SRC(ip, i*64+30) >> 6;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*19+ 9) |= (uint64_t)SRC(ip, i*64+31) << 13;\ -} - -#define BITPACK64_19(ip, op, parm) { \ - BITBLK64_19(ip, 0, op, parm); SRCI(ip); op += 19*4/sizeof(op[0]);\ -} - -#define BITBLK64_20(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint64_t)SRC(ip, i*16+ 0) ;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 20;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*5+ 0) |= (uint64_t)SRC(ip, i*16+ 2) << 40 | (uint64_t)SRC1(ip, i*16+3) << 60;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*5+ 1) = (uint64_t)SRC(ip, i*16+ 3) >> 4;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 4) << 16;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*16+ 5) << 36 | (uint64_t)SRC1(ip, i*16+6) << 56;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*5+ 2) = (uint64_t)SRC(ip, i*16+ 6) >> 8;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*16+ 7) << 12;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*5+ 2) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | (uint64_t)SRC1(ip, i*16+9) << 52;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*5+ 3) = (uint64_t)SRC(ip, i*16+ 9) >> 12;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*5+ 3) |= 
(uint64_t)SRC(ip, i*16+10) << 8;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*16+11) << 28 | (uint64_t)SRC1(ip, i*16+12) << 48;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*5+ 4) = (uint64_t)SRC(ip, i*16+12) >> 16;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+13) << 4;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+14) << 24;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*16+15) << 44;\ -} - -#define BITPACK64_20(ip, op, parm) { \ - BITBLK64_20(ip, 0, op, parm);\ - BITBLK64_20(ip, 1, op, parm); SRCI(ip); op += 20*4/sizeof(op[0]);\ -} - -#define BITBLK64_21(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*21+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 21;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*21+ 0) |= (uint64_t)SRC(ip, i*64+ 2) << 42 | (uint64_t)SRC1(ip, i*64+3) << 63;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*21+ 1) = (uint64_t)SRC(ip, i*64+ 3) >> 1;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 20;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*64+ 5) << 41 | (uint64_t)SRC1(ip, i*64+6) << 62;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*21+ 2) = (uint64_t)SRC(ip, i*64+ 6) >> 2;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 19;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*21+ 2) |= (uint64_t)SRC(ip, i*64+ 8) << 40 | (uint64_t)SRC1(ip, i*64+9) << 61;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*21+ 3) = (uint64_t)SRC(ip, i*64+ 9) >> 3;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+10) << 18;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*64+11) << 39 | (uint64_t)SRC1(ip, i*64+12) << 60;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*21+ 4) = 
(uint64_t)SRC(ip, i*64+12) >> 4;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+13) << 17;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*21+ 4) |= (uint64_t)SRC(ip, i*64+14) << 38 | (uint64_t)SRC1(ip, i*64+15) << 59;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*21+ 5) = (uint64_t)SRC(ip, i*64+15) >> 5;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+16) << 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*64+17) << 37 | (uint64_t)SRC1(ip, i*64+18) << 58;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*21+ 6) = (uint64_t)SRC(ip, i*64+18) >> 6;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+19) << 15;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*21+ 6) |= (uint64_t)SRC(ip, i*64+20) << 36 | (uint64_t)SRC1(ip, i*64+21) << 57;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*21+ 7) = (uint64_t)SRC(ip, i*64+21) >> 7;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+22) << 14;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*64+23) << 35 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*21+ 8) = (uint64_t)SRC(ip, i*64+24) >> 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+25) << 13;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*21+ 8) |= (uint64_t)SRC(ip, i*64+26) << 34 | (uint64_t)SRC1(ip, i*64+27) << 55;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*21+ 9) = (uint64_t)SRC(ip, i*64+27) >> 9;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+28) << 12;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*64+29) << 33 | (uint64_t)SRC1(ip, i*64+30) << 54;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*21+10) = (uint64_t)SRC(ip, i*64+30) >> 10;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*21+10) |= (uint64_t)SRC(ip, i*64+31) << 11;\ -} - -#define BITPACK64_21(ip, 
op, parm) { \ - BITBLK64_21(ip, 0, op, parm); SRCI(ip); op += 21*4/sizeof(op[0]);\ -} - -#define BITBLK64_22(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*11+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 22 | (uint64_t)SRC1(ip, i*32+2) << 44;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*11+ 1) = (uint64_t)SRC(ip, i*32+ 2) >> 20;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 2;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*11+ 1) |= (uint64_t)SRC(ip, i*32+ 4) << 24 | (uint64_t)SRC1(ip, i*32+5) << 46;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*11+ 2) = (uint64_t)SRC(ip, i*32+ 5) >> 18;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*32+ 6) << 4;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*32+ 7) << 26 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*11+ 3) = (uint64_t)SRC(ip, i*32+ 8) >> 16;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*32+ 9) << 6;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*11+ 3) |= (uint64_t)SRC(ip, i*32+10) << 28 | (uint64_t)SRC1(ip, i*32+11) << 50;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*11+ 4) = (uint64_t)SRC(ip, i*32+11) >> 14;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*32+12) << 8;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*32+13) << 30 | (uint64_t)SRC1(ip, i*32+14) << 52;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*11+ 5) = (uint64_t)SRC(ip, i*32+14) >> 12;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*32+15) << 10;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*11+ 5) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 54;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*11+ 6) = (uint64_t)SRC(ip, i*32+17) >> 10;\ - IPPB(ip, i*32+18, parm); 
*((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+18) << 12;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*32+19) << 34 | (uint64_t)SRC1(ip, i*32+20) << 56;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*11+ 7) = (uint64_t)SRC(ip, i*32+20) >> 8;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+21) << 14;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*11+ 7) |= (uint64_t)SRC(ip, i*32+22) << 36 | (uint64_t)SRC1(ip, i*32+23) << 58;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*11+ 8) = (uint64_t)SRC(ip, i*32+23) >> 6;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+24) << 16;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*32+25) << 38 | (uint64_t)SRC1(ip, i*32+26) << 60;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*11+ 9) = (uint64_t)SRC(ip, i*32+26) >> 4;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+27) << 18;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*11+ 9) |= (uint64_t)SRC(ip, i*32+28) << 40 | (uint64_t)SRC1(ip, i*32+29) << 62;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*11+10) = (uint64_t)SRC(ip, i*32+29) >> 2;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+30) << 20;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*11+10) |= (uint64_t)SRC(ip, i*32+31) << 42;\ -} - -#define BITPACK64_22(ip, op, parm) { \ - BITBLK64_22(ip, 0, op, parm); SRCI(ip); op += 22*4/sizeof(op[0]);\ -} - -#define BITBLK64_23(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*23+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*23+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 23 | (uint64_t)SRC1(ip, i*64+2) << 46;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*23+ 1) = (uint64_t)SRC(ip, i*64+ 2) >> 18;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*23+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 5;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*23+ 1) |= 
(uint64_t)SRC(ip, i*64+ 4) << 28 | (uint64_t)SRC1(ip, i*64+5) << 51;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*23+ 2) = (uint64_t)SRC(ip, i*64+ 5) >> 13;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 10;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*64+ 7) << 33 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*23+ 3) = (uint64_t)SRC(ip, i*64+ 8) >> 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 15;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*23+ 3) |= (uint64_t)SRC(ip, i*64+10) << 38 | (uint64_t)SRC1(ip, i*64+11) << 61;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*23+ 4) = (uint64_t)SRC(ip, i*64+11) >> 3;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*23+ 4) |= (uint64_t)SRC(ip, i*64+12) << 20 | (uint64_t)SRC1(ip, i*64+13) << 43;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*23+ 5) = (uint64_t)SRC(ip, i*64+13) >> 21;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*23+ 5) |= (uint64_t)SRC(ip, i*64+14) << 2;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*23+ 5) |= (uint64_t)SRC(ip, i*64+15) << 25 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*23+ 6) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*23+ 6) |= (uint64_t)SRC(ip, i*64+17) << 7;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*23+ 6) |= (uint64_t)SRC(ip, i*64+18) << 30 | (uint64_t)SRC1(ip, i*64+19) << 53;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*23+ 7) = (uint64_t)SRC(ip, i*64+19) >> 11;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+20) << 12;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*64+21) << 35 | (uint64_t)SRC1(ip, i*64+22) << 58;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*23+ 8) = (uint64_t)SRC(ip, i*64+22) >> 6;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+23) << 
17;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*23+ 8) |= (uint64_t)SRC(ip, i*64+24) << 40 | (uint64_t)SRC1(ip, i*64+25) << 63;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*23+ 9) = (uint64_t)SRC(ip, i*64+25) >> 1;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*23+ 9) |= (uint64_t)SRC(ip, i*64+26) << 22 | (uint64_t)SRC1(ip, i*64+27) << 45;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*23+10) = (uint64_t)SRC(ip, i*64+27) >> 19;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*23+10) |= (uint64_t)SRC(ip, i*64+28) << 4;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*23+10) |= (uint64_t)SRC(ip, i*64+29) << 27 | (uint64_t)SRC1(ip, i*64+30) << 50;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*23+11) = (uint64_t)SRC(ip, i*64+30) >> 14;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*23+11) |= (uint64_t)SRC(ip, i*64+31) << 9;\ -} - -#define BITPACK64_23(ip, op, parm) { \ - BITBLK64_23(ip, 0, op, parm); SRCI(ip); op += 23*4/sizeof(op[0]);\ -} - -#define BITBLK64_24(ip, i, op, parm) { ;\ - IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint64_t)SRC(ip, i*8+ 0) ;\ - IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*3+ 0) |= (uint64_t)SRC(ip, i*8+ 1) << 24 | (uint64_t)SRC1(ip, i*8+2) << 48;\ - IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*3+ 1) = (uint64_t)SRC(ip, i*8+ 2) >> 16;\ - IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*8+ 3) << 8;\ - IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*3+ 1) |= (uint64_t)SRC(ip, i*8+ 4) << 32 | (uint64_t)SRC1(ip, i*8+5) << 56;\ - IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*3+ 2) = (uint64_t)SRC(ip, i*8+ 5) >> 8;\ - IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 6) << 16;\ - IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*8+ 7) << 40;\ -} - -#define BITPACK64_24(ip, op, parm) { \ - BITBLK64_24(ip, 0, op, parm);\ - BITBLK64_24(ip, 1, op, parm);\ - BITBLK64_24(ip, 2, op, parm);\ - BITBLK64_24(ip, 3, op, parm); SRCI(ip); op += 24*4/sizeof(op[0]);\ -} - -#define 
BITBLK64_25(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*25+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*25+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 25 | (uint64_t)SRC1(ip, i*64+2) << 50;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*25+ 1) = (uint64_t)SRC(ip, i*64+ 2) >> 14;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 11;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*25+ 1) |= (uint64_t)SRC(ip, i*64+ 4) << 36 | (uint64_t)SRC1(ip, i*64+5) << 61;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*25+ 2) = (uint64_t)SRC(ip, i*64+ 5) >> 3;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*25+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 22 | (uint64_t)SRC1(ip, i*64+7) << 47;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*25+ 3) = (uint64_t)SRC(ip, i*64+ 7) >> 17;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*64+ 9) << 33 | (uint64_t)SRC1(ip, i*64+10) << 58;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*25+ 4) = (uint64_t)SRC(ip, i*64+10) >> 6;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*25+ 4) |= (uint64_t)SRC(ip, i*64+11) << 19 | (uint64_t)SRC1(ip, i*64+12) << 44;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*25+ 5) = (uint64_t)SRC(ip, i*64+12) >> 20;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*25+ 5) |= (uint64_t)SRC(ip, i*64+13) << 5;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*25+ 5) |= (uint64_t)SRC(ip, i*64+14) << 30 | (uint64_t)SRC1(ip, i*64+15) << 55;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*25+ 6) = (uint64_t)SRC(ip, i*64+15) >> 9;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*25+ 6) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 41;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*25+ 7) = (uint64_t)SRC(ip, i*64+17) >> 23;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*25+ 7) |= (uint64_t)SRC(ip, i*64+18) << 2;\ - 
IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*25+ 7) |= (uint64_t)SRC(ip, i*64+19) << 27 | (uint64_t)SRC1(ip, i*64+20) << 52;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*25+ 8) = (uint64_t)SRC(ip, i*64+20) >> 12;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+21) << 13;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*25+ 8) |= (uint64_t)SRC(ip, i*64+22) << 38 | (uint64_t)SRC1(ip, i*64+23) << 63;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*25+ 9) = (uint64_t)SRC(ip, i*64+23) >> 1;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*25+ 9) |= (uint64_t)SRC(ip, i*64+24) << 24 | (uint64_t)SRC1(ip, i*64+25) << 49;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*25+10) = (uint64_t)SRC(ip, i*64+25) >> 15;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+26) << 10;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*64+27) << 35 | (uint64_t)SRC1(ip, i*64+28) << 60;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*25+11) = (uint64_t)SRC(ip, i*64+28) >> 4;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*25+11) |= (uint64_t)SRC(ip, i*64+29) << 21 | (uint64_t)SRC1(ip, i*64+30) << 46;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*25+12) = (uint64_t)SRC(ip, i*64+30) >> 18;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*25+12) |= (uint64_t)SRC(ip, i*64+31) << 7;\ -} - -#define BITPACK64_25(ip, op, parm) { \ - BITBLK64_25(ip, 0, op, parm); SRCI(ip); op += 25*4/sizeof(op[0]);\ -} - -#define BITBLK64_26(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*13+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 26 | (uint64_t)SRC1(ip, i*32+2) << 52;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*13+ 1) = (uint64_t)SRC(ip, i*32+ 2) >> 12;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*13+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 14 | (uint64_t)SRC1(ip, i*32+4) << 40;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*13+ 2) = 
(uint64_t)SRC(ip, i*32+ 4) >> 24;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*32+ 5) << 2;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*13+ 2) |= (uint64_t)SRC(ip, i*32+ 6) << 28 | (uint64_t)SRC1(ip, i*32+7) << 54;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*13+ 3) = (uint64_t)SRC(ip, i*32+ 7) >> 10;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*13+ 3) |= (uint64_t)SRC(ip, i*32+ 8) << 16 | (uint64_t)SRC1(ip, i*32+9) << 42;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*13+ 4) = (uint64_t)SRC(ip, i*32+ 9) >> 22;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*32+10) << 4;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*32+11) << 30 | (uint64_t)SRC1(ip, i*32+12) << 56;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*13+ 5) = (uint64_t)SRC(ip, i*32+12) >> 8;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*13+ 5) |= (uint64_t)SRC(ip, i*32+13) << 18 | (uint64_t)SRC1(ip, i*32+14) << 44;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*13+ 6) = (uint64_t)SRC(ip, i*32+14) >> 20;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*32+15) << 6;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*13+ 6) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 58;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*13+ 7) = (uint64_t)SRC(ip, i*32+17) >> 6;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*13+ 7) |= (uint64_t)SRC(ip, i*32+18) << 20 | (uint64_t)SRC1(ip, i*32+19) << 46;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*13+ 8) = (uint64_t)SRC(ip, i*32+19) >> 18;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+20) << 8;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*32+21) << 34 | (uint64_t)SRC1(ip, i*32+22) << 60;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*13+ 9) = (uint64_t)SRC(ip, i*32+22) >> 4;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*13+ 9) |= (uint64_t)SRC(ip, i*32+23) << 22 | 
(uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*13+10) = (uint64_t)SRC(ip, i*32+24) >> 16;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+25) << 10;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*13+10) |= (uint64_t)SRC(ip, i*32+26) << 36 | (uint64_t)SRC1(ip, i*32+27) << 62;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*13+11) = (uint64_t)SRC(ip, i*32+27) >> 2;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*13+11) |= (uint64_t)SRC(ip, i*32+28) << 24 | (uint64_t)SRC1(ip, i*32+29) << 50;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*13+12) = (uint64_t)SRC(ip, i*32+29) >> 14;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+30) << 12;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*32+31) << 38;\ -} - -#define BITPACK64_26(ip, op, parm) { \ - BITBLK64_26(ip, 0, op, parm); SRCI(ip); op += 26*4/sizeof(op[0]);\ -} - -#define BITBLK64_27(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*27+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*27+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 27 | (uint64_t)SRC1(ip, i*64+2) << 54;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*27+ 1) = (uint64_t)SRC(ip, i*64+ 2) >> 10;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*27+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 17 | (uint64_t)SRC1(ip, i*64+4) << 44;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*27+ 2) = (uint64_t)SRC(ip, i*64+ 4) >> 20;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 7;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*27+ 2) |= (uint64_t)SRC(ip, i*64+ 6) << 34 | (uint64_t)SRC1(ip, i*64+7) << 61;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*27+ 3) = (uint64_t)SRC(ip, i*64+ 7) >> 3;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*27+ 3) |= (uint64_t)SRC(ip, i*64+ 8) << 24 | (uint64_t)SRC1(ip, i*64+9) << 51;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*27+ 4) = 
(uint64_t)SRC(ip, i*64+ 9) >> 13;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*27+ 4) |= (uint64_t)SRC(ip, i*64+10) << 14 | (uint64_t)SRC1(ip, i*64+11) << 41;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*27+ 5) = (uint64_t)SRC(ip, i*64+11) >> 23;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*27+ 5) |= (uint64_t)SRC(ip, i*64+12) << 4;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*27+ 5) |= (uint64_t)SRC(ip, i*64+13) << 31 | (uint64_t)SRC1(ip, i*64+14) << 58;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*27+ 6) = (uint64_t)SRC(ip, i*64+14) >> 6;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*27+ 6) |= (uint64_t)SRC(ip, i*64+15) << 21 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*27+ 7) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*27+ 7) |= (uint64_t)SRC(ip, i*64+17) << 11 | (uint64_t)SRC1(ip, i*64+18) << 38;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*27+ 8) = (uint64_t)SRC(ip, i*64+18) >> 26;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*27+ 8) |= (uint64_t)SRC(ip, i*64+19) << 1;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*27+ 8) |= (uint64_t)SRC(ip, i*64+20) << 28 | (uint64_t)SRC1(ip, i*64+21) << 55;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*27+ 9) = (uint64_t)SRC(ip, i*64+21) >> 9;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*27+ 9) |= (uint64_t)SRC(ip, i*64+22) << 18 | (uint64_t)SRC1(ip, i*64+23) << 45;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*27+10) = (uint64_t)SRC(ip, i*64+23) >> 19;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+24) << 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*64+25) << 35 | (uint64_t)SRC1(ip, i*64+26) << 62;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*27+11) = (uint64_t)SRC(ip, i*64+26) >> 2;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*27+11) |= (uint64_t)SRC(ip, i*64+27) << 25 | (uint64_t)SRC1(ip, i*64+28) << 52;\ - IPPB(ip, i*64+28, parm); *((uint64_t 
*)op+i*27+12) = (uint64_t)SRC(ip, i*64+28) >> 12;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*27+12) |= (uint64_t)SRC(ip, i*64+29) << 15 | (uint64_t)SRC1(ip, i*64+30) << 42;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*27+13) = (uint64_t)SRC(ip, i*64+30) >> 22;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*27+13) |= (uint64_t)SRC(ip, i*64+31) << 5;\ -} - -#define BITPACK64_27(ip, op, parm) { \ - BITBLK64_27(ip, 0, op, parm); SRCI(ip); op += 27*4/sizeof(op[0]);\ -} - -#define BITBLK64_28(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint64_t)SRC(ip, i*16+ 0) ;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*7+ 0) |= (uint64_t)SRC(ip, i*16+ 1) << 28 | (uint64_t)SRC1(ip, i*16+2) << 56;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*7+ 1) = (uint64_t)SRC(ip, i*16+ 2) >> 8;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*7+ 1) |= (uint64_t)SRC(ip, i*16+ 3) << 20 | (uint64_t)SRC1(ip, i*16+4) << 48;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*7+ 2) = (uint64_t)SRC(ip, i*16+ 4) >> 16;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*7+ 2) |= (uint64_t)SRC(ip, i*16+ 5) << 12 | (uint64_t)SRC1(ip, i*16+6) << 40;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*7+ 3) = (uint64_t)SRC(ip, i*16+ 6) >> 24;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*16+ 7) << 4;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*7+ 3) |= (uint64_t)SRC(ip, i*16+ 8) << 32 | (uint64_t)SRC1(ip, i*16+9) << 60;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*7+ 4) = (uint64_t)SRC(ip, i*16+ 9) >> 4;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*7+ 4) |= (uint64_t)SRC(ip, i*16+10) << 24 | (uint64_t)SRC1(ip, i*16+11) << 52;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*7+ 5) = (uint64_t)SRC(ip, i*16+11) >> 12;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*7+ 5) |= (uint64_t)SRC(ip, i*16+12) << 16 | (uint64_t)SRC1(ip, i*16+13) << 44;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*7+ 6) = (uint64_t)SRC(ip, i*16+13) >> 20;\ - IPPB(ip, i*16+14, 
parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+14) << 8;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*16+15) << 36;\ -} - -#define BITPACK64_28(ip, op, parm) { \ - BITBLK64_28(ip, 0, op, parm);\ - BITBLK64_28(ip, 1, op, parm); SRCI(ip); op += 28*4/sizeof(op[0]);\ -} - -#define BITBLK64_29(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*29+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*29+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 29 | (uint64_t)SRC1(ip, i*64+2) << 58;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*29+ 1) = (uint64_t)SRC(ip, i*64+ 2) >> 6;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*29+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 23 | (uint64_t)SRC1(ip, i*64+4) << 52;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*29+ 2) = (uint64_t)SRC(ip, i*64+ 4) >> 12;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*29+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 17 | (uint64_t)SRC1(ip, i*64+6) << 46;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*29+ 3) = (uint64_t)SRC(ip, i*64+ 6) >> 18;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*29+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 11 | (uint64_t)SRC1(ip, i*64+8) << 40;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*29+ 4) = (uint64_t)SRC(ip, i*64+ 8) >> 24;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 5;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*29+ 4) |= (uint64_t)SRC(ip, i*64+10) << 34 | (uint64_t)SRC1(ip, i*64+11) << 63;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*29+ 5) = (uint64_t)SRC(ip, i*64+11) >> 1;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*29+ 5) |= (uint64_t)SRC(ip, i*64+12) << 28 | (uint64_t)SRC1(ip, i*64+13) << 57;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*29+ 6) = (uint64_t)SRC(ip, i*64+13) >> 7;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*29+ 6) |= (uint64_t)SRC(ip, i*64+14) << 22 | (uint64_t)SRC1(ip, i*64+15) << 51;\ - IPPB(ip, i*64+15, parm); *((uint64_t 
*)op+i*29+ 7) = (uint64_t)SRC(ip, i*64+15) >> 13;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*29+ 7) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 45;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*29+ 8) = (uint64_t)SRC(ip, i*64+17) >> 19;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*29+ 8) |= (uint64_t)SRC(ip, i*64+18) << 10 | (uint64_t)SRC1(ip, i*64+19) << 39;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*29+ 9) = (uint64_t)SRC(ip, i*64+19) >> 25;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+20) << 4;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*64+21) << 33 | (uint64_t)SRC1(ip, i*64+22) << 62;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*29+10) = (uint64_t)SRC(ip, i*64+22) >> 2;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*29+10) |= (uint64_t)SRC(ip, i*64+23) << 27 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*29+11) = (uint64_t)SRC(ip, i*64+24) >> 8;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*29+11) |= (uint64_t)SRC(ip, i*64+25) << 21 | (uint64_t)SRC1(ip, i*64+26) << 50;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*29+12) = (uint64_t)SRC(ip, i*64+26) >> 14;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*29+12) |= (uint64_t)SRC(ip, i*64+27) << 15 | (uint64_t)SRC1(ip, i*64+28) << 44;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*29+13) = (uint64_t)SRC(ip, i*64+28) >> 20;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*29+13) |= (uint64_t)SRC(ip, i*64+29) << 9 | (uint64_t)SRC1(ip, i*64+30) << 38;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*29+14) = (uint64_t)SRC(ip, i*64+30) >> 26;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*29+14) |= (uint64_t)SRC(ip, i*64+31) << 3;\ -} - -#define BITPACK64_29(ip, op, parm) { \ - BITBLK64_29(ip, 0, op, parm); SRCI(ip); op += 29*4/sizeof(op[0]);\ -} - -#define BITBLK64_30(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint64_t)SRC(ip, i*32+ 0) ;\ - 
IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*15+ 0) |= (uint64_t)SRC(ip, i*32+ 1) << 30 | (uint64_t)SRC1(ip, i*32+2) << 60;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*15+ 1) = (uint64_t)SRC(ip, i*32+ 2) >> 4;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*15+ 1) |= (uint64_t)SRC(ip, i*32+ 3) << 26 | (uint64_t)SRC1(ip, i*32+4) << 56;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*15+ 2) = (uint64_t)SRC(ip, i*32+ 4) >> 8;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*15+ 2) |= (uint64_t)SRC(ip, i*32+ 5) << 22 | (uint64_t)SRC1(ip, i*32+6) << 52;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*15+ 3) = (uint64_t)SRC(ip, i*32+ 6) >> 12;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*15+ 3) |= (uint64_t)SRC(ip, i*32+ 7) << 18 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*15+ 4) = (uint64_t)SRC(ip, i*32+ 8) >> 16;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*15+ 4) |= (uint64_t)SRC(ip, i*32+ 9) << 14 | (uint64_t)SRC1(ip, i*32+10) << 44;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*15+ 5) = (uint64_t)SRC(ip, i*32+10) >> 20;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*15+ 5) |= (uint64_t)SRC(ip, i*32+11) << 10 | (uint64_t)SRC1(ip, i*32+12) << 40;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*15+ 6) = (uint64_t)SRC(ip, i*32+12) >> 24;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*15+ 6) |= (uint64_t)SRC(ip, i*32+13) << 6 | (uint64_t)SRC1(ip, i*32+14) << 36;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*15+ 7) = (uint64_t)SRC(ip, i*32+14) >> 28;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*32+15) << 2;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*15+ 7) |= (uint64_t)SRC(ip, i*32+16) << 32 | (uint64_t)SRC1(ip, i*32+17) << 62;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*15+ 8) = (uint64_t)SRC(ip, i*32+17) >> 2;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*15+ 8) |= (uint64_t)SRC(ip, i*32+18) << 28 | (uint64_t)SRC1(ip, i*32+19) << 58;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*15+ 9) = 
(uint64_t)SRC(ip, i*32+19) >> 6;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*15+ 9) |= (uint64_t)SRC(ip, i*32+20) << 24 | (uint64_t)SRC1(ip, i*32+21) << 54;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*15+10) = (uint64_t)SRC(ip, i*32+21) >> 10;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*15+10) |= (uint64_t)SRC(ip, i*32+22) << 20 | (uint64_t)SRC1(ip, i*32+23) << 50;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*15+11) = (uint64_t)SRC(ip, i*32+23) >> 14;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*15+11) |= (uint64_t)SRC(ip, i*32+24) << 16 | (uint64_t)SRC1(ip, i*32+25) << 46;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*15+12) = (uint64_t)SRC(ip, i*32+25) >> 18;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*15+12) |= (uint64_t)SRC(ip, i*32+26) << 12 | (uint64_t)SRC1(ip, i*32+27) << 42;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*15+13) = (uint64_t)SRC(ip, i*32+27) >> 22;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*15+13) |= (uint64_t)SRC(ip, i*32+28) << 8 | (uint64_t)SRC1(ip, i*32+29) << 38;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*15+14) = (uint64_t)SRC(ip, i*32+29) >> 26;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+30) << 4;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*32+31) << 34;\ -} - -#define BITPACK64_30(ip, op, parm) { \ - BITBLK64_30(ip, 0, op, parm); SRCI(ip); op += 30*4/sizeof(op[0]);\ -} - -#define BITBLK64_31(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*31+ 0) = (uint64_t)SRC(ip, i*64+ 0) ;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*31+ 0) |= (uint64_t)SRC(ip, i*64+ 1) << 31 | (uint64_t)SRC1(ip, i*64+2) << 62;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*31+ 1) = (uint64_t)SRC(ip, i*64+ 2) >> 2;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*31+ 1) |= (uint64_t)SRC(ip, i*64+ 3) << 29 | (uint64_t)SRC1(ip, i*64+4) << 60;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*31+ 2) = (uint64_t)SRC(ip, i*64+ 4) >> 4;\ - IPPB(ip, i*64+ 5, 
parm); *((uint64_t *)op+i*31+ 2) |= (uint64_t)SRC(ip, i*64+ 5) << 27 | (uint64_t)SRC1(ip, i*64+6) << 58;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*31+ 3) = (uint64_t)SRC(ip, i*64+ 6) >> 6;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*31+ 3) |= (uint64_t)SRC(ip, i*64+ 7) << 25 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*31+ 4) = (uint64_t)SRC(ip, i*64+ 8) >> 8;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*31+ 4) |= (uint64_t)SRC(ip, i*64+ 9) << 23 | (uint64_t)SRC1(ip, i*64+10) << 54;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*31+ 5) = (uint64_t)SRC(ip, i*64+10) >> 10;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*31+ 5) |= (uint64_t)SRC(ip, i*64+11) << 21 | (uint64_t)SRC1(ip, i*64+12) << 52;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*31+ 6) = (uint64_t)SRC(ip, i*64+12) >> 12;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*31+ 6) |= (uint64_t)SRC(ip, i*64+13) << 19 | (uint64_t)SRC1(ip, i*64+14) << 50;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*31+ 7) = (uint64_t)SRC(ip, i*64+14) >> 14;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*31+ 7) |= (uint64_t)SRC(ip, i*64+15) << 17 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*31+ 8) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*31+ 8) |= (uint64_t)SRC(ip, i*64+17) << 15 | (uint64_t)SRC1(ip, i*64+18) << 46;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*31+ 9) = (uint64_t)SRC(ip, i*64+18) >> 18;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*31+ 9) |= (uint64_t)SRC(ip, i*64+19) << 13 | (uint64_t)SRC1(ip, i*64+20) << 44;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*31+10) = (uint64_t)SRC(ip, i*64+20) >> 20;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*31+10) |= (uint64_t)SRC(ip, i*64+21) << 11 | (uint64_t)SRC1(ip, i*64+22) << 42;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*31+11) = (uint64_t)SRC(ip, i*64+22) >> 22;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*31+11) |= 
(uint64_t)SRC(ip, i*64+23) << 9 | (uint64_t)SRC1(ip, i*64+24) << 40;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*31+12) = (uint64_t)SRC(ip, i*64+24) >> 24;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*31+12) |= (uint64_t)SRC(ip, i*64+25) << 7 | (uint64_t)SRC1(ip, i*64+26) << 38;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*31+13) = (uint64_t)SRC(ip, i*64+26) >> 26;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*31+13) |= (uint64_t)SRC(ip, i*64+27) << 5 | (uint64_t)SRC1(ip, i*64+28) << 36;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*31+14) = (uint64_t)SRC(ip, i*64+28) >> 28;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*31+14) |= (uint64_t)SRC(ip, i*64+29) << 3 | (uint64_t)SRC1(ip, i*64+30) << 34;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*31+15) = (uint64_t)SRC(ip, i*64+30) >> 30;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*31+15) |= (uint64_t)SRC(ip, i*64+31) << 1;\ -} - -#define BITPACK64_31(ip, op, parm) { \ - BITBLK64_31(ip, 0, op, parm); SRCI(ip); op += 31*4/sizeof(op[0]);\ -} - -#define BITBLK64_32(ip, i, op, parm) { \ - IPPB(ip, i*2+ 0, parm); *(uint32_t *)(op+i*8+ 0) = SRC(ip, i*2+ 0);\ - IPPB(ip, i*2+ 1, parm); *(uint32_t *)(op+i*8+ 4) = SRC(ip, i*2+ 1);;\ -} - -#define BITPACK64_32(ip, op, parm) { \ - BITBLK64_32(ip, 0, op, parm);\ - BITBLK64_32(ip, 1, op, parm);\ - BITBLK64_32(ip, 2, op, parm);\ - BITBLK64_32(ip, 3, op, parm);\ - BITBLK64_32(ip, 4, op, parm);\ - BITBLK64_32(ip, 5, op, parm);\ - BITBLK64_32(ip, 6, op, parm);\ - BITBLK64_32(ip, 7, op, parm);\ - BITBLK64_32(ip, 8, op, parm);\ - BITBLK64_32(ip, 9, op, parm);\ - BITBLK64_32(ip, 10, op, parm);\ - BITBLK64_32(ip, 11, op, parm);\ - BITBLK64_32(ip, 12, op, parm);\ - BITBLK64_32(ip, 13, op, parm);\ - BITBLK64_32(ip, 14, op, parm);\ - BITBLK64_32(ip, 15, op, parm); SRCI(ip); op += 32*4/sizeof(op[0]);\ -} - -#define BITBLK64_33(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*33+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 33;\ - IPPB(ip, 
i*64+ 1, parm); *((uint64_t *)op+i*33+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 31;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*33+ 1) |= (uint64_t)SRC(ip, i*64+ 2) << 2 | (uint64_t)SRC1(ip, i*64+3) << 35;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*33+ 2) = (uint64_t)SRC(ip, i*64+ 3) >> 29;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*33+ 2) |= (uint64_t)SRC(ip, i*64+ 4) << 4 | (uint64_t)SRC1(ip, i*64+5) << 37;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*33+ 3) = (uint64_t)SRC(ip, i*64+ 5) >> 27;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*33+ 3) |= (uint64_t)SRC(ip, i*64+ 6) << 6 | (uint64_t)SRC1(ip, i*64+7) << 39;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*33+ 4) = (uint64_t)SRC(ip, i*64+ 7) >> 25;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*33+ 4) |= (uint64_t)SRC(ip, i*64+ 8) << 8 | (uint64_t)SRC1(ip, i*64+9) << 41;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*33+ 5) = (uint64_t)SRC(ip, i*64+ 9) >> 23;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*33+ 5) |= (uint64_t)SRC(ip, i*64+10) << 10 | (uint64_t)SRC1(ip, i*64+11) << 43;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*33+ 6) = (uint64_t)SRC(ip, i*64+11) >> 21;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*33+ 6) |= (uint64_t)SRC(ip, i*64+12) << 12 | (uint64_t)SRC1(ip, i*64+13) << 45;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*33+ 7) = (uint64_t)SRC(ip, i*64+13) >> 19;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*33+ 7) |= (uint64_t)SRC(ip, i*64+14) << 14 | (uint64_t)SRC1(ip, i*64+15) << 47;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*33+ 8) = (uint64_t)SRC(ip, i*64+15) >> 17;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*33+ 8) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 49;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*33+ 9) = (uint64_t)SRC(ip, i*64+17) >> 15;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*33+ 9) |= (uint64_t)SRC(ip, i*64+18) << 18 | (uint64_t)SRC1(ip, i*64+19) << 51;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*33+10) = 
(uint64_t)SRC(ip, i*64+19) >> 13;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*33+10) |= (uint64_t)SRC(ip, i*64+20) << 20 | (uint64_t)SRC1(ip, i*64+21) << 53;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*33+11) = (uint64_t)SRC(ip, i*64+21) >> 11;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*33+11) |= (uint64_t)SRC(ip, i*64+22) << 22 | (uint64_t)SRC1(ip, i*64+23) << 55;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*33+12) = (uint64_t)SRC(ip, i*64+23) >> 9;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*33+12) |= (uint64_t)SRC(ip, i*64+24) << 24 | (uint64_t)SRC1(ip, i*64+25) << 57;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*33+13) = (uint64_t)SRC(ip, i*64+25) >> 7;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*33+13) |= (uint64_t)SRC(ip, i*64+26) << 26 | (uint64_t)SRC1(ip, i*64+27) << 59;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*33+14) = (uint64_t)SRC(ip, i*64+27) >> 5;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*33+14) |= (uint64_t)SRC(ip, i*64+28) << 28 | (uint64_t)SRC1(ip, i*64+29) << 61;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*33+15) = (uint64_t)SRC(ip, i*64+29) >> 3;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*33+15) |= (uint64_t)SRC(ip, i*64+30) << 30 | (uint64_t)SRC1(ip, i*64+31) << 63;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*33+16) = (uint64_t)SRC(ip, i*64+31) >> 1;\ -} - -#define BITPACK64_33(ip, op, parm) { \ - BITBLK64_33(ip, 0, op, parm); SRCI(ip); op += 33*4/sizeof(op[0]);\ -} - -#define BITBLK64_34(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*17+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 34;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*17+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 30;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*17+ 1) |= (uint64_t)SRC(ip, i*32+ 2) << 4 | (uint64_t)SRC1(ip, i*32+3) << 38;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*17+ 2) = (uint64_t)SRC(ip, i*32+ 3) >> 26;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*17+ 2) |= (uint64_t)SRC(ip, i*32+ 
4) << 8 | (uint64_t)SRC1(ip, i*32+5) << 42;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*17+ 3) = (uint64_t)SRC(ip, i*32+ 5) >> 22;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*17+ 3) |= (uint64_t)SRC(ip, i*32+ 6) << 12 | (uint64_t)SRC1(ip, i*32+7) << 46;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*17+ 4) = (uint64_t)SRC(ip, i*32+ 7) >> 18;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*17+ 4) |= (uint64_t)SRC(ip, i*32+ 8) << 16 | (uint64_t)SRC1(ip, i*32+9) << 50;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*17+ 5) = (uint64_t)SRC(ip, i*32+ 9) >> 14;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*17+ 5) |= (uint64_t)SRC(ip, i*32+10) << 20 | (uint64_t)SRC1(ip, i*32+11) << 54;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*17+ 6) = (uint64_t)SRC(ip, i*32+11) >> 10;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*17+ 6) |= (uint64_t)SRC(ip, i*32+12) << 24 | (uint64_t)SRC1(ip, i*32+13) << 58;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*17+ 7) = (uint64_t)SRC(ip, i*32+13) >> 6;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*17+ 7) |= (uint64_t)SRC(ip, i*32+14) << 28 | (uint64_t)SRC1(ip, i*32+15) << 62;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*17+ 8) = (uint64_t)SRC(ip, i*32+15) >> 2 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*17+ 9) = (uint64_t)SRC(ip, i*32+16) >> 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*17+ 9) |= (uint64_t)SRC(ip, i*32+17) << 2 | (uint64_t)SRC1(ip, i*32+18) << 36;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*17+10) = (uint64_t)SRC(ip, i*32+18) >> 28;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*17+10) |= (uint64_t)SRC(ip, i*32+19) << 6 | (uint64_t)SRC1(ip, i*32+20) << 40;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*17+11) = (uint64_t)SRC(ip, i*32+20) >> 24;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*17+11) |= (uint64_t)SRC(ip, i*32+21) << 10 | (uint64_t)SRC1(ip, i*32+22) << 44;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*17+12) = (uint64_t)SRC(ip, i*32+22) >> 20;\ - 
IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*17+12) |= (uint64_t)SRC(ip, i*32+23) << 14 | (uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*17+13) = (uint64_t)SRC(ip, i*32+24) >> 16;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*17+13) |= (uint64_t)SRC(ip, i*32+25) << 18 | (uint64_t)SRC1(ip, i*32+26) << 52;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*17+14) = (uint64_t)SRC(ip, i*32+26) >> 12;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*17+14) |= (uint64_t)SRC(ip, i*32+27) << 22 | (uint64_t)SRC1(ip, i*32+28) << 56;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*17+15) = (uint64_t)SRC(ip, i*32+28) >> 8;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*17+15) |= (uint64_t)SRC(ip, i*32+29) << 26 | (uint64_t)SRC1(ip, i*32+30) << 60;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*17+16) = (uint64_t)SRC(ip, i*32+30) >> 4;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*17+16) |= (uint64_t)SRC(ip, i*32+31) << 30;\ -} - -#define BITPACK64_34(ip, op, parm) { \ - BITBLK64_34(ip, 0, op, parm); SRCI(ip); op += 34*4/sizeof(op[0]);\ -} - -#define BITBLK64_35(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*35+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 35;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*35+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 29;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*35+ 1) |= (uint64_t)SRC(ip, i*64+ 2) << 6 | (uint64_t)SRC1(ip, i*64+3) << 41;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*35+ 2) = (uint64_t)SRC(ip, i*64+ 3) >> 23;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*35+ 2) |= (uint64_t)SRC(ip, i*64+ 4) << 12 | (uint64_t)SRC1(ip, i*64+5) << 47;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*35+ 3) = (uint64_t)SRC(ip, i*64+ 5) >> 17;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*35+ 3) |= (uint64_t)SRC(ip, i*64+ 6) << 18 | (uint64_t)SRC1(ip, i*64+7) << 53;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*35+ 4) = (uint64_t)SRC(ip, i*64+ 7) >> 11;\ - IPPB(ip, i*64+ 8, parm); 
*((uint64_t *)op+i*35+ 4) |= (uint64_t)SRC(ip, i*64+ 8) << 24 | (uint64_t)SRC1(ip, i*64+9) << 59;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*35+ 5) = (uint64_t)SRC(ip, i*64+ 9) >> 5 | (uint64_t)SRC1(ip, i*64+10) << 30;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*35+ 6) = (uint64_t)SRC(ip, i*64+10) >> 34;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*35+ 6) |= (uint64_t)SRC(ip, i*64+11) << 1 | (uint64_t)SRC1(ip, i*64+12) << 36;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*35+ 7) = (uint64_t)SRC(ip, i*64+12) >> 28;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*35+ 7) |= (uint64_t)SRC(ip, i*64+13) << 7 | (uint64_t)SRC1(ip, i*64+14) << 42;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*35+ 8) = (uint64_t)SRC(ip, i*64+14) >> 22;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*35+ 8) |= (uint64_t)SRC(ip, i*64+15) << 13 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*35+ 9) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*35+ 9) |= (uint64_t)SRC(ip, i*64+17) << 19 | (uint64_t)SRC1(ip, i*64+18) << 54;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*35+10) = (uint64_t)SRC(ip, i*64+18) >> 10;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*35+10) |= (uint64_t)SRC(ip, i*64+19) << 25 | (uint64_t)SRC1(ip, i*64+20) << 60;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*35+11) = (uint64_t)SRC(ip, i*64+20) >> 4 | (uint64_t)SRC1(ip, i*64+21) << 31;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*35+12) = (uint64_t)SRC(ip, i*64+21) >> 33;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*35+12) |= (uint64_t)SRC(ip, i*64+22) << 2 | (uint64_t)SRC1(ip, i*64+23) << 37;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*35+13) = (uint64_t)SRC(ip, i*64+23) >> 27;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*35+13) |= (uint64_t)SRC(ip, i*64+24) << 8 | (uint64_t)SRC1(ip, i*64+25) << 43;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*35+14) = (uint64_t)SRC(ip, i*64+25) >> 21;\ - IPPB(ip, i*64+26, parm); *((uint64_t 
*)op+i*35+14) |= (uint64_t)SRC(ip, i*64+26) << 14 | (uint64_t)SRC1(ip, i*64+27) << 49;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*35+15) = (uint64_t)SRC(ip, i*64+27) >> 15;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*35+15) |= (uint64_t)SRC(ip, i*64+28) << 20 | (uint64_t)SRC1(ip, i*64+29) << 55;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*35+16) = (uint64_t)SRC(ip, i*64+29) >> 9;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*35+16) |= (uint64_t)SRC(ip, i*64+30) << 26 | (uint64_t)SRC1(ip, i*64+31) << 61;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*35+17) = (uint64_t)SRC(ip, i*64+31) >> 3;\ -} - -#define BITPACK64_35(ip, op, parm) { \ - BITBLK64_35(ip, 0, op, parm); SRCI(ip); op += 35*4/sizeof(op[0]);\ -} - -#define BITBLK64_36(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*9+ 0) = (uint64_t)SRC(ip, i*16+ 0) | (uint64_t)SRC1(ip, i*16+1) << 36;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*9+ 1) = (uint64_t)SRC(ip, i*16+ 1) >> 28;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*9+ 1) |= (uint64_t)SRC(ip, i*16+ 2) << 8 | (uint64_t)SRC1(ip, i*16+3) << 44;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*9+ 2) = (uint64_t)SRC(ip, i*16+ 3) >> 20;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*9+ 2) |= (uint64_t)SRC(ip, i*16+ 4) << 16 | (uint64_t)SRC1(ip, i*16+5) << 52;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*9+ 3) = (uint64_t)SRC(ip, i*16+ 5) >> 12;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*9+ 3) |= (uint64_t)SRC(ip, i*16+ 6) << 24 | (uint64_t)SRC1(ip, i*16+7) << 60;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*9+ 4) = (uint64_t)SRC(ip, i*16+ 7) >> 4 | (uint64_t)SRC1(ip, i*16+8) << 32;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*9+ 5) = (uint64_t)SRC(ip, i*16+ 8) >> 32;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*9+ 5) |= (uint64_t)SRC(ip, i*16+ 9) << 4 | (uint64_t)SRC1(ip, i*16+10) << 40;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*9+ 6) = (uint64_t)SRC(ip, i*16+10) >> 24;\ - IPPB(ip, i*16+11, parm); *((uint64_t 
*)op+i*9+ 6) |= (uint64_t)SRC(ip, i*16+11) << 12 | (uint64_t)SRC1(ip, i*16+12) << 48;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*9+ 7) = (uint64_t)SRC(ip, i*16+12) >> 16;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*9+ 7) |= (uint64_t)SRC(ip, i*16+13) << 20 | (uint64_t)SRC1(ip, i*16+14) << 56;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*9+ 8) = (uint64_t)SRC(ip, i*16+14) >> 8;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*9+ 8) |= (uint64_t)SRC(ip, i*16+15) << 28;\ -} - -#define BITPACK64_36(ip, op, parm) { \ - BITBLK64_36(ip, 0, op, parm);\ - BITBLK64_36(ip, 1, op, parm); SRCI(ip); op += 36*4/sizeof(op[0]);\ -} - -#define BITBLK64_37(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*37+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 37;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*37+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 27;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*37+ 1) |= (uint64_t)SRC(ip, i*64+ 2) << 10 | (uint64_t)SRC1(ip, i*64+3) << 47;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*37+ 2) = (uint64_t)SRC(ip, i*64+ 3) >> 17;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*37+ 2) |= (uint64_t)SRC(ip, i*64+ 4) << 20 | (uint64_t)SRC1(ip, i*64+5) << 57;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*37+ 3) = (uint64_t)SRC(ip, i*64+ 5) >> 7 | (uint64_t)SRC1(ip, i*64+6) << 30;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*37+ 4) = (uint64_t)SRC(ip, i*64+ 6) >> 34;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*37+ 4) |= (uint64_t)SRC(ip, i*64+ 7) << 3 | (uint64_t)SRC1(ip, i*64+8) << 40;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*37+ 5) = (uint64_t)SRC(ip, i*64+ 8) >> 24;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*37+ 5) |= (uint64_t)SRC(ip, i*64+ 9) << 13 | (uint64_t)SRC1(ip, i*64+10) << 50;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*37+ 6) = (uint64_t)SRC(ip, i*64+10) >> 14;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*37+ 6) |= (uint64_t)SRC(ip, i*64+11) << 23 | (uint64_t)SRC1(ip, i*64+12) << 60;\ 
- IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*37+ 7) = (uint64_t)SRC(ip, i*64+12) >> 4 | (uint64_t)SRC1(ip, i*64+13) << 33;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*37+ 8) = (uint64_t)SRC(ip, i*64+13) >> 31;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*37+ 8) |= (uint64_t)SRC(ip, i*64+14) << 6 | (uint64_t)SRC1(ip, i*64+15) << 43;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*37+ 9) = (uint64_t)SRC(ip, i*64+15) >> 21;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*37+ 9) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 53;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*37+10) = (uint64_t)SRC(ip, i*64+17) >> 11;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*37+10) |= (uint64_t)SRC(ip, i*64+18) << 26 | (uint64_t)SRC1(ip, i*64+19) << 63;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*37+11) = (uint64_t)SRC(ip, i*64+19) >> 1 | (uint64_t)SRC1(ip, i*64+20) << 36;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*37+12) = (uint64_t)SRC(ip, i*64+20) >> 28;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*37+12) |= (uint64_t)SRC(ip, i*64+21) << 9 | (uint64_t)SRC1(ip, i*64+22) << 46;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*37+13) = (uint64_t)SRC(ip, i*64+22) >> 18;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*37+13) |= (uint64_t)SRC(ip, i*64+23) << 19 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*37+14) = (uint64_t)SRC(ip, i*64+24) >> 8 | (uint64_t)SRC1(ip, i*64+25) << 29;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*37+15) = (uint64_t)SRC(ip, i*64+25) >> 35;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*37+15) |= (uint64_t)SRC(ip, i*64+26) << 2 | (uint64_t)SRC1(ip, i*64+27) << 39;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*37+16) = (uint64_t)SRC(ip, i*64+27) >> 25;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*37+16) |= (uint64_t)SRC(ip, i*64+28) << 12 | (uint64_t)SRC1(ip, i*64+29) << 49;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*37+17) = (uint64_t)SRC(ip, i*64+29) >> 15;\ - IPPB(ip, 
i*64+30, parm); *((uint64_t *)op+i*37+17) |= (uint64_t)SRC(ip, i*64+30) << 22 | (uint64_t)SRC1(ip, i*64+31) << 59;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*37+18) = (uint64_t)SRC(ip, i*64+31) >> 5;\ -} - -#define BITPACK64_37(ip, op, parm) { \ - BITBLK64_37(ip, 0, op, parm); SRCI(ip); op += 37*4/sizeof(op[0]);\ -} - -#define BITBLK64_38(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*19+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 38;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*19+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 26;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*19+ 1) |= (uint64_t)SRC(ip, i*32+ 2) << 12 | (uint64_t)SRC1(ip, i*32+3) << 50;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*19+ 2) = (uint64_t)SRC(ip, i*32+ 3) >> 14;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*19+ 2) |= (uint64_t)SRC(ip, i*32+ 4) << 24 | (uint64_t)SRC1(ip, i*32+5) << 62;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*19+ 3) = (uint64_t)SRC(ip, i*32+ 5) >> 2 | (uint64_t)SRC1(ip, i*32+6) << 36;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*19+ 4) = (uint64_t)SRC(ip, i*32+ 6) >> 28;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*19+ 4) |= (uint64_t)SRC(ip, i*32+ 7) << 10 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*19+ 5) = (uint64_t)SRC(ip, i*32+ 8) >> 16;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*19+ 5) |= (uint64_t)SRC(ip, i*32+ 9) << 22 | (uint64_t)SRC1(ip, i*32+10) << 60;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*19+ 6) = (uint64_t)SRC(ip, i*32+10) >> 4 | (uint64_t)SRC1(ip, i*32+11) << 34;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*19+ 7) = (uint64_t)SRC(ip, i*32+11) >> 30;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*19+ 7) |= (uint64_t)SRC(ip, i*32+12) << 8 | (uint64_t)SRC1(ip, i*32+13) << 46;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*19+ 8) = (uint64_t)SRC(ip, i*32+13) >> 18;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*19+ 8) |= (uint64_t)SRC(ip, i*32+14) << 20 | 
(uint64_t)SRC1(ip, i*32+15) << 58;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*19+ 9) = (uint64_t)SRC(ip, i*32+15) >> 6 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*19+10) = (uint64_t)SRC(ip, i*32+16) >> 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*19+10) |= (uint64_t)SRC(ip, i*32+17) << 6 | (uint64_t)SRC1(ip, i*32+18) << 44;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*19+11) = (uint64_t)SRC(ip, i*32+18) >> 20;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*19+11) |= (uint64_t)SRC(ip, i*32+19) << 18 | (uint64_t)SRC1(ip, i*32+20) << 56;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*19+12) = (uint64_t)SRC(ip, i*32+20) >> 8 | (uint64_t)SRC1(ip, i*32+21) << 30;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*19+13) = (uint64_t)SRC(ip, i*32+21) >> 34;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*19+13) |= (uint64_t)SRC(ip, i*32+22) << 4 | (uint64_t)SRC1(ip, i*32+23) << 42;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*19+14) = (uint64_t)SRC(ip, i*32+23) >> 22;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*19+14) |= (uint64_t)SRC(ip, i*32+24) << 16 | (uint64_t)SRC1(ip, i*32+25) << 54;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*19+15) = (uint64_t)SRC(ip, i*32+25) >> 10 | (uint64_t)SRC1(ip, i*32+26) << 28;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*19+16) = (uint64_t)SRC(ip, i*32+26) >> 36;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*19+16) |= (uint64_t)SRC(ip, i*32+27) << 2 | (uint64_t)SRC1(ip, i*32+28) << 40;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*19+17) = (uint64_t)SRC(ip, i*32+28) >> 24;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*19+17) |= (uint64_t)SRC(ip, i*32+29) << 14 | (uint64_t)SRC1(ip, i*32+30) << 52;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*19+18) = (uint64_t)SRC(ip, i*32+30) >> 12;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*19+18) |= (uint64_t)SRC(ip, i*32+31) << 26;\ -} - -#define BITPACK64_38(ip, op, parm) { \ - BITBLK64_38(ip, 0, op, parm); SRCI(ip); op += 
38*4/sizeof(op[0]);\ -} - -#define BITBLK64_39(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*39+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 39;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*39+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 25;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*39+ 1) |= (uint64_t)SRC(ip, i*64+ 2) << 14 | (uint64_t)SRC1(ip, i*64+3) << 53;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*39+ 2) = (uint64_t)SRC(ip, i*64+ 3) >> 11 | (uint64_t)SRC1(ip, i*64+4) << 28;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*39+ 3) = (uint64_t)SRC(ip, i*64+ 4) >> 36;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*39+ 3) |= (uint64_t)SRC(ip, i*64+ 5) << 3 | (uint64_t)SRC1(ip, i*64+6) << 42;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*39+ 4) = (uint64_t)SRC(ip, i*64+ 6) >> 22;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*39+ 4) |= (uint64_t)SRC(ip, i*64+ 7) << 17 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*39+ 5) = (uint64_t)SRC(ip, i*64+ 8) >> 8 | (uint64_t)SRC1(ip, i*64+9) << 31;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*39+ 6) = (uint64_t)SRC(ip, i*64+ 9) >> 33;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*39+ 6) |= (uint64_t)SRC(ip, i*64+10) << 6 | (uint64_t)SRC1(ip, i*64+11) << 45;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*39+ 7) = (uint64_t)SRC(ip, i*64+11) >> 19;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*39+ 7) |= (uint64_t)SRC(ip, i*64+12) << 20 | (uint64_t)SRC1(ip, i*64+13) << 59;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*39+ 8) = (uint64_t)SRC(ip, i*64+13) >> 5 | (uint64_t)SRC1(ip, i*64+14) << 34;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*39+ 9) = (uint64_t)SRC(ip, i*64+14) >> 30;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*39+ 9) |= (uint64_t)SRC(ip, i*64+15) << 9 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*39+10) = (uint64_t)SRC(ip, i*64+16) >> 16;\ - IPPB(ip, i*64+17, parm); *((uint64_t 
*)op+i*39+10) |= (uint64_t)SRC(ip, i*64+17) << 23 | (uint64_t)SRC1(ip, i*64+18) << 62;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*39+11) = (uint64_t)SRC(ip, i*64+18) >> 2 | (uint64_t)SRC1(ip, i*64+19) << 37;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*39+12) = (uint64_t)SRC(ip, i*64+19) >> 27;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*39+12) |= (uint64_t)SRC(ip, i*64+20) << 12 | (uint64_t)SRC1(ip, i*64+21) << 51;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*39+13) = (uint64_t)SRC(ip, i*64+21) >> 13 | (uint64_t)SRC1(ip, i*64+22) << 26;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*39+14) = (uint64_t)SRC(ip, i*64+22) >> 38;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*39+14) |= (uint64_t)SRC(ip, i*64+23) << 1 | (uint64_t)SRC1(ip, i*64+24) << 40;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*39+15) = (uint64_t)SRC(ip, i*64+24) >> 24;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*39+15) |= (uint64_t)SRC(ip, i*64+25) << 15 | (uint64_t)SRC1(ip, i*64+26) << 54;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*39+16) = (uint64_t)SRC(ip, i*64+26) >> 10 | (uint64_t)SRC1(ip, i*64+27) << 29;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*39+17) = (uint64_t)SRC(ip, i*64+27) >> 35;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*39+17) |= (uint64_t)SRC(ip, i*64+28) << 4 | (uint64_t)SRC1(ip, i*64+29) << 43;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*39+18) = (uint64_t)SRC(ip, i*64+29) >> 21;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*39+18) |= (uint64_t)SRC(ip, i*64+30) << 18 | (uint64_t)SRC1(ip, i*64+31) << 57;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*39+19) = (uint64_t)SRC(ip, i*64+31) >> 7;\ -} - -#define BITPACK64_39(ip, op, parm) { \ - BITBLK64_39(ip, 0, op, parm); SRCI(ip); op += 39*4/sizeof(op[0]);\ -} - -#define BITBLK64_40(ip, i, op, parm) { ;\ - IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*5+ 0) = (uint64_t)SRC(ip, i*8+ 0) | (uint64_t)SRC1(ip, i*8+1) << 40;\ - IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*5+ 1) = (uint64_t)SRC(ip, i*8+ 1) >> 
24;\ - IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*5+ 1) |= (uint64_t)SRC(ip, i*8+ 2) << 16 | (uint64_t)SRC1(ip, i*8+3) << 56;\ - IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*5+ 2) = (uint64_t)SRC(ip, i*8+ 3) >> 8 | (uint64_t)SRC1(ip, i*8+4) << 32;\ - IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*5+ 3) = (uint64_t)SRC(ip, i*8+ 4) >> 32;\ - IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*5+ 3) |= (uint64_t)SRC(ip, i*8+ 5) << 8 | (uint64_t)SRC1(ip, i*8+6) << 48;\ - IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*5+ 4) = (uint64_t)SRC(ip, i*8+ 6) >> 16;\ - IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*5+ 4) |= (uint64_t)SRC(ip, i*8+ 7) << 24;\ -} - -#define BITPACK64_40(ip, op, parm) { \ - BITBLK64_40(ip, 0, op, parm);\ - BITBLK64_40(ip, 1, op, parm);\ - BITBLK64_40(ip, 2, op, parm);\ - BITBLK64_40(ip, 3, op, parm); SRCI(ip); op += 40*4/sizeof(op[0]);\ -} - -#define BITBLK64_41(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*41+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 41;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*41+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 23;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*41+ 1) |= (uint64_t)SRC(ip, i*64+ 2) << 18 | (uint64_t)SRC1(ip, i*64+3) << 59;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*41+ 2) = (uint64_t)SRC(ip, i*64+ 3) >> 5 | (uint64_t)SRC1(ip, i*64+4) << 36;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*41+ 3) = (uint64_t)SRC(ip, i*64+ 4) >> 28;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*41+ 3) |= (uint64_t)SRC(ip, i*64+ 5) << 13 | (uint64_t)SRC1(ip, i*64+6) << 54;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*41+ 4) = (uint64_t)SRC(ip, i*64+ 6) >> 10 | (uint64_t)SRC1(ip, i*64+7) << 31;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*41+ 5) = (uint64_t)SRC(ip, i*64+ 7) >> 33;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*41+ 5) |= (uint64_t)SRC(ip, i*64+ 8) << 8 | (uint64_t)SRC1(ip, i*64+9) << 49;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*41+ 6) = (uint64_t)SRC(ip, i*64+ 9) >> 15 | 
(uint64_t)SRC1(ip, i*64+10) << 26;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*41+ 7) = (uint64_t)SRC(ip, i*64+10) >> 38;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*41+ 7) |= (uint64_t)SRC(ip, i*64+11) << 3 | (uint64_t)SRC1(ip, i*64+12) << 44;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*41+ 8) = (uint64_t)SRC(ip, i*64+12) >> 20;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*41+ 8) |= (uint64_t)SRC(ip, i*64+13) << 21 | (uint64_t)SRC1(ip, i*64+14) << 62;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*41+ 9) = (uint64_t)SRC(ip, i*64+14) >> 2 | (uint64_t)SRC1(ip, i*64+15) << 39;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*41+10) = (uint64_t)SRC(ip, i*64+15) >> 25;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*41+10) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 57;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*41+11) = (uint64_t)SRC(ip, i*64+17) >> 7 | (uint64_t)SRC1(ip, i*64+18) << 34;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*41+12) = (uint64_t)SRC(ip, i*64+18) >> 30;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*41+12) |= (uint64_t)SRC(ip, i*64+19) << 11 | (uint64_t)SRC1(ip, i*64+20) << 52;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*41+13) = (uint64_t)SRC(ip, i*64+20) >> 12 | (uint64_t)SRC1(ip, i*64+21) << 29;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*41+14) = (uint64_t)SRC(ip, i*64+21) >> 35;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*41+14) |= (uint64_t)SRC(ip, i*64+22) << 6 | (uint64_t)SRC1(ip, i*64+23) << 47;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*41+15) = (uint64_t)SRC(ip, i*64+23) >> 17 | (uint64_t)SRC1(ip, i*64+24) << 24;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*41+16) = (uint64_t)SRC(ip, i*64+24) >> 40;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*41+16) |= (uint64_t)SRC(ip, i*64+25) << 1 | (uint64_t)SRC1(ip, i*64+26) << 42;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*41+17) = (uint64_t)SRC(ip, i*64+26) >> 22;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*41+17) |= 
(uint64_t)SRC(ip, i*64+27) << 19 | (uint64_t)SRC1(ip, i*64+28) << 60;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*41+18) = (uint64_t)SRC(ip, i*64+28) >> 4 | (uint64_t)SRC1(ip, i*64+29) << 37;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*41+19) = (uint64_t)SRC(ip, i*64+29) >> 27;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*41+19) |= (uint64_t)SRC(ip, i*64+30) << 14 | (uint64_t)SRC1(ip, i*64+31) << 55;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*41+20) = (uint64_t)SRC(ip, i*64+31) >> 9;\ -} - -#define BITPACK64_41(ip, op, parm) { \ - BITBLK64_41(ip, 0, op, parm); SRCI(ip); op += 41*4/sizeof(op[0]);\ -} - -#define BITBLK64_42(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*21+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 42;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*21+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 22;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*21+ 1) |= (uint64_t)SRC(ip, i*32+ 2) << 20 | (uint64_t)SRC1(ip, i*32+3) << 62;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*21+ 2) = (uint64_t)SRC(ip, i*32+ 3) >> 2 | (uint64_t)SRC1(ip, i*32+4) << 40;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*21+ 3) = (uint64_t)SRC(ip, i*32+ 4) >> 24;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*21+ 3) |= (uint64_t)SRC(ip, i*32+ 5) << 18 | (uint64_t)SRC1(ip, i*32+6) << 60;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*21+ 4) = (uint64_t)SRC(ip, i*32+ 6) >> 4 | (uint64_t)SRC1(ip, i*32+7) << 38;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*21+ 5) = (uint64_t)SRC(ip, i*32+ 7) >> 26;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*21+ 5) |= (uint64_t)SRC(ip, i*32+ 8) << 16 | (uint64_t)SRC1(ip, i*32+9) << 58;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*21+ 6) = (uint64_t)SRC(ip, i*32+ 9) >> 6 | (uint64_t)SRC1(ip, i*32+10) << 36;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*21+ 7) = (uint64_t)SRC(ip, i*32+10) >> 28;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*21+ 7) |= (uint64_t)SRC(ip, i*32+11) << 14 | 
(uint64_t)SRC1(ip, i*32+12) << 56;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*21+ 8) = (uint64_t)SRC(ip, i*32+12) >> 8 | (uint64_t)SRC1(ip, i*32+13) << 34;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*21+ 9) = (uint64_t)SRC(ip, i*32+13) >> 30;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*21+ 9) |= (uint64_t)SRC(ip, i*32+14) << 12 | (uint64_t)SRC1(ip, i*32+15) << 54;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*21+10) = (uint64_t)SRC(ip, i*32+15) >> 10 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*21+11) = (uint64_t)SRC(ip, i*32+16) >> 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*21+11) |= (uint64_t)SRC(ip, i*32+17) << 10 | (uint64_t)SRC1(ip, i*32+18) << 52;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*21+12) = (uint64_t)SRC(ip, i*32+18) >> 12 | (uint64_t)SRC1(ip, i*32+19) << 30;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*21+13) = (uint64_t)SRC(ip, i*32+19) >> 34;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*21+13) |= (uint64_t)SRC(ip, i*32+20) << 8 | (uint64_t)SRC1(ip, i*32+21) << 50;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*21+14) = (uint64_t)SRC(ip, i*32+21) >> 14 | (uint64_t)SRC1(ip, i*32+22) << 28;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*21+15) = (uint64_t)SRC(ip, i*32+22) >> 36;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*21+15) |= (uint64_t)SRC(ip, i*32+23) << 6 | (uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*21+16) = (uint64_t)SRC(ip, i*32+24) >> 16 | (uint64_t)SRC1(ip, i*32+25) << 26;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*21+17) = (uint64_t)SRC(ip, i*32+25) >> 38;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*21+17) |= (uint64_t)SRC(ip, i*32+26) << 4 | (uint64_t)SRC1(ip, i*32+27) << 46;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*21+18) = (uint64_t)SRC(ip, i*32+27) >> 18 | (uint64_t)SRC1(ip, i*32+28) << 24;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*21+19) = (uint64_t)SRC(ip, i*32+28) >> 40;\ - IPPB(ip, i*32+29, parm); 
*((uint64_t *)op+i*21+19) |= (uint64_t)SRC(ip, i*32+29) << 2 | (uint64_t)SRC1(ip, i*32+30) << 44;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*21+20) = (uint64_t)SRC(ip, i*32+30) >> 20;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*21+20) |= (uint64_t)SRC(ip, i*32+31) << 22;\ -} - -#define BITPACK64_42(ip, op, parm) { \ - BITBLK64_42(ip, 0, op, parm); SRCI(ip); op += 42*4/sizeof(op[0]);\ -} - -#define BITBLK64_43(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*43+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 43;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*43+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 21 | (uint64_t)SRC1(ip, i*64+2) << 22;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*43+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 42;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*43+ 2) |= (uint64_t)SRC(ip, i*64+ 3) << 1 | (uint64_t)SRC1(ip, i*64+4) << 44;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*43+ 3) = (uint64_t)SRC(ip, i*64+ 4) >> 20 | (uint64_t)SRC1(ip, i*64+5) << 23;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*43+ 4) = (uint64_t)SRC(ip, i*64+ 5) >> 41;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*43+ 4) |= (uint64_t)SRC(ip, i*64+ 6) << 2 | (uint64_t)SRC1(ip, i*64+7) << 45;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*43+ 5) = (uint64_t)SRC(ip, i*64+ 7) >> 19 | (uint64_t)SRC1(ip, i*64+8) << 24;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*43+ 6) = (uint64_t)SRC(ip, i*64+ 8) >> 40;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*43+ 6) |= (uint64_t)SRC(ip, i*64+ 9) << 3 | (uint64_t)SRC1(ip, i*64+10) << 46;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*43+ 7) = (uint64_t)SRC(ip, i*64+10) >> 18 | (uint64_t)SRC1(ip, i*64+11) << 25;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*43+ 8) = (uint64_t)SRC(ip, i*64+11) >> 39;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*43+ 8) |= (uint64_t)SRC(ip, i*64+12) << 4 | (uint64_t)SRC1(ip, i*64+13) << 47;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*43+ 9) = (uint64_t)SRC(ip, 
i*64+13) >> 17 | (uint64_t)SRC1(ip, i*64+14) << 26;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*43+10) = (uint64_t)SRC(ip, i*64+14) >> 38;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*43+10) |= (uint64_t)SRC(ip, i*64+15) << 5 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*43+11) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 27;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*43+12) = (uint64_t)SRC(ip, i*64+17) >> 37;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*43+12) |= (uint64_t)SRC(ip, i*64+18) << 6 | (uint64_t)SRC1(ip, i*64+19) << 49;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*43+13) = (uint64_t)SRC(ip, i*64+19) >> 15 | (uint64_t)SRC1(ip, i*64+20) << 28;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*43+14) = (uint64_t)SRC(ip, i*64+20) >> 36;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*43+14) |= (uint64_t)SRC(ip, i*64+21) << 7 | (uint64_t)SRC1(ip, i*64+22) << 50;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*43+15) = (uint64_t)SRC(ip, i*64+22) >> 14 | (uint64_t)SRC1(ip, i*64+23) << 29;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*43+16) = (uint64_t)SRC(ip, i*64+23) >> 35;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*43+16) |= (uint64_t)SRC(ip, i*64+24) << 8 | (uint64_t)SRC1(ip, i*64+25) << 51;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*43+17) = (uint64_t)SRC(ip, i*64+25) >> 13 | (uint64_t)SRC1(ip, i*64+26) << 30;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*43+18) = (uint64_t)SRC(ip, i*64+26) >> 34;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*43+18) |= (uint64_t)SRC(ip, i*64+27) << 9 | (uint64_t)SRC1(ip, i*64+28) << 52;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*43+19) = (uint64_t)SRC(ip, i*64+28) >> 12 | (uint64_t)SRC1(ip, i*64+29) << 31;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*43+20) = (uint64_t)SRC(ip, i*64+29) >> 33;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*43+20) |= (uint64_t)SRC(ip, i*64+30) << 10 | (uint64_t)SRC1(ip, i*64+31) << 53;\ - IPPB(ip, i*64+31, 
parm); *((uint64_t *)op+i*43+21) = (uint64_t)SRC(ip, i*64+31) >> 11;\ -} - -#define BITPACK64_43(ip, op, parm) { \ - BITBLK64_43(ip, 0, op, parm); SRCI(ip); op += 43*4/sizeof(op[0]);\ -} - -#define BITBLK64_44(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*11+ 0) = (uint64_t)SRC(ip, i*16+ 0) | (uint64_t)SRC1(ip, i*16+1) << 44;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*11+ 1) = (uint64_t)SRC(ip, i*16+ 1) >> 20 | (uint64_t)SRC1(ip, i*16+2) << 24;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*11+ 2) = (uint64_t)SRC(ip, i*16+ 2) >> 40;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*11+ 2) |= (uint64_t)SRC(ip, i*16+ 3) << 4 | (uint64_t)SRC1(ip, i*16+4) << 48;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*11+ 3) = (uint64_t)SRC(ip, i*16+ 4) >> 16 | (uint64_t)SRC1(ip, i*16+5) << 28;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*11+ 4) = (uint64_t)SRC(ip, i*16+ 5) >> 36;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*11+ 4) |= (uint64_t)SRC(ip, i*16+ 6) << 8 | (uint64_t)SRC1(ip, i*16+7) << 52;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*11+ 5) = (uint64_t)SRC(ip, i*16+ 7) >> 12 | (uint64_t)SRC1(ip, i*16+8) << 32;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*11+ 6) = (uint64_t)SRC(ip, i*16+ 8) >> 32;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*11+ 6) |= (uint64_t)SRC(ip, i*16+ 9) << 12 | (uint64_t)SRC1(ip, i*16+10) << 56;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*11+ 7) = (uint64_t)SRC(ip, i*16+10) >> 8 | (uint64_t)SRC1(ip, i*16+11) << 36;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*11+ 8) = (uint64_t)SRC(ip, i*16+11) >> 28;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*11+ 8) |= (uint64_t)SRC(ip, i*16+12) << 16 | (uint64_t)SRC1(ip, i*16+13) << 60;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*11+ 9) = (uint64_t)SRC(ip, i*16+13) >> 4 | (uint64_t)SRC1(ip, i*16+14) << 40;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*11+10) = (uint64_t)SRC(ip, i*16+14) >> 24;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*11+10) |= 
(uint64_t)SRC(ip, i*16+15) << 20;\ -} - -#define BITPACK64_44(ip, op, parm) { \ - BITBLK64_44(ip, 0, op, parm);\ - BITBLK64_44(ip, 1, op, parm); SRCI(ip); op += 44*4/sizeof(op[0]);\ -} - -#define BITBLK64_45(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*45+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 45;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*45+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 19 | (uint64_t)SRC1(ip, i*64+2) << 26;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*45+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 38;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*45+ 2) |= (uint64_t)SRC(ip, i*64+ 3) << 7 | (uint64_t)SRC1(ip, i*64+4) << 52;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*45+ 3) = (uint64_t)SRC(ip, i*64+ 4) >> 12 | (uint64_t)SRC1(ip, i*64+5) << 33;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*45+ 4) = (uint64_t)SRC(ip, i*64+ 5) >> 31;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*45+ 4) |= (uint64_t)SRC(ip, i*64+ 6) << 14 | (uint64_t)SRC1(ip, i*64+7) << 59;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*45+ 5) = (uint64_t)SRC(ip, i*64+ 7) >> 5 | (uint64_t)SRC1(ip, i*64+8) << 40;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*45+ 6) = (uint64_t)SRC(ip, i*64+ 8) >> 24 | (uint64_t)SRC1(ip, i*64+9) << 21;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*45+ 7) = (uint64_t)SRC(ip, i*64+ 9) >> 43;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*45+ 7) |= (uint64_t)SRC(ip, i*64+10) << 2 | (uint64_t)SRC1(ip, i*64+11) << 47;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*45+ 8) = (uint64_t)SRC(ip, i*64+11) >> 17 | (uint64_t)SRC1(ip, i*64+12) << 28;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*45+ 9) = (uint64_t)SRC(ip, i*64+12) >> 36;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*45+ 9) |= (uint64_t)SRC(ip, i*64+13) << 9 | (uint64_t)SRC1(ip, i*64+14) << 54;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*45+10) = (uint64_t)SRC(ip, i*64+14) >> 10 | (uint64_t)SRC1(ip, i*64+15) << 35;\ - IPPB(ip, i*64+15, parm); 
*((uint64_t *)op+i*45+11) = (uint64_t)SRC(ip, i*64+15) >> 29;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*45+11) |= (uint64_t)SRC(ip, i*64+16) << 16 | (uint64_t)SRC1(ip, i*64+17) << 61;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*45+12) = (uint64_t)SRC(ip, i*64+17) >> 3 | (uint64_t)SRC1(ip, i*64+18) << 42;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*45+13) = (uint64_t)SRC(ip, i*64+18) >> 22 | (uint64_t)SRC1(ip, i*64+19) << 23;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*45+14) = (uint64_t)SRC(ip, i*64+19) >> 41;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*45+14) |= (uint64_t)SRC(ip, i*64+20) << 4 | (uint64_t)SRC1(ip, i*64+21) << 49;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*45+15) = (uint64_t)SRC(ip, i*64+21) >> 15 | (uint64_t)SRC1(ip, i*64+22) << 30;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*45+16) = (uint64_t)SRC(ip, i*64+22) >> 34;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*45+16) |= (uint64_t)SRC(ip, i*64+23) << 11 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*45+17) = (uint64_t)SRC(ip, i*64+24) >> 8 | (uint64_t)SRC1(ip, i*64+25) << 37;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*45+18) = (uint64_t)SRC(ip, i*64+25) >> 27;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*45+18) |= (uint64_t)SRC(ip, i*64+26) << 18 | (uint64_t)SRC1(ip, i*64+27) << 63;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*45+19) = (uint64_t)SRC(ip, i*64+27) >> 1 | (uint64_t)SRC1(ip, i*64+28) << 44;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*45+20) = (uint64_t)SRC(ip, i*64+28) >> 20 | (uint64_t)SRC1(ip, i*64+29) << 25;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*45+21) = (uint64_t)SRC(ip, i*64+29) >> 39;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*45+21) |= (uint64_t)SRC(ip, i*64+30) << 6 | (uint64_t)SRC1(ip, i*64+31) << 51;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*45+22) = (uint64_t)SRC(ip, i*64+31) >> 13;\ -} - -#define BITPACK64_45(ip, op, parm) { \ - BITBLK64_45(ip, 0, op, parm); SRCI(ip); op += 
45*4/sizeof(op[0]);\ -} - -#define BITBLK64_46(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*23+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 46;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*23+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 18 | (uint64_t)SRC1(ip, i*32+2) << 28;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*23+ 2) = (uint64_t)SRC(ip, i*32+ 2) >> 36;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*23+ 2) |= (uint64_t)SRC(ip, i*32+ 3) << 10 | (uint64_t)SRC1(ip, i*32+4) << 56;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*23+ 3) = (uint64_t)SRC(ip, i*32+ 4) >> 8 | (uint64_t)SRC1(ip, i*32+5) << 38;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*23+ 4) = (uint64_t)SRC(ip, i*32+ 5) >> 26 | (uint64_t)SRC1(ip, i*32+6) << 20;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*23+ 5) = (uint64_t)SRC(ip, i*32+ 6) >> 44;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*23+ 5) |= (uint64_t)SRC(ip, i*32+ 7) << 2 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*23+ 6) = (uint64_t)SRC(ip, i*32+ 8) >> 16 | (uint64_t)SRC1(ip, i*32+9) << 30;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*23+ 7) = (uint64_t)SRC(ip, i*32+ 9) >> 34;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*23+ 7) |= (uint64_t)SRC(ip, i*32+10) << 12 | (uint64_t)SRC1(ip, i*32+11) << 58;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*23+ 8) = (uint64_t)SRC(ip, i*32+11) >> 6 | (uint64_t)SRC1(ip, i*32+12) << 40;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*23+ 9) = (uint64_t)SRC(ip, i*32+12) >> 24 | (uint64_t)SRC1(ip, i*32+13) << 22;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*23+10) = (uint64_t)SRC(ip, i*32+13) >> 42;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*23+10) |= (uint64_t)SRC(ip, i*32+14) << 4 | (uint64_t)SRC1(ip, i*32+15) << 50;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*23+11) = (uint64_t)SRC(ip, i*32+15) >> 14 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*23+12) = 
(uint64_t)SRC(ip, i*32+16) >> 32;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*23+12) |= (uint64_t)SRC(ip, i*32+17) << 14 | (uint64_t)SRC1(ip, i*32+18) << 60;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*23+13) = (uint64_t)SRC(ip, i*32+18) >> 4 | (uint64_t)SRC1(ip, i*32+19) << 42;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*23+14) = (uint64_t)SRC(ip, i*32+19) >> 22 | (uint64_t)SRC1(ip, i*32+20) << 24;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*23+15) = (uint64_t)SRC(ip, i*32+20) >> 40;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*23+15) |= (uint64_t)SRC(ip, i*32+21) << 6 | (uint64_t)SRC1(ip, i*32+22) << 52;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*23+16) = (uint64_t)SRC(ip, i*32+22) >> 12 | (uint64_t)SRC1(ip, i*32+23) << 34;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*23+17) = (uint64_t)SRC(ip, i*32+23) >> 30;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*23+17) |= (uint64_t)SRC(ip, i*32+24) << 16 | (uint64_t)SRC1(ip, i*32+25) << 62;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*23+18) = (uint64_t)SRC(ip, i*32+25) >> 2 | (uint64_t)SRC1(ip, i*32+26) << 44;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*23+19) = (uint64_t)SRC(ip, i*32+26) >> 20 | (uint64_t)SRC1(ip, i*32+27) << 26;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*23+20) = (uint64_t)SRC(ip, i*32+27) >> 38;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*23+20) |= (uint64_t)SRC(ip, i*32+28) << 8 | (uint64_t)SRC1(ip, i*32+29) << 54;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*23+21) = (uint64_t)SRC(ip, i*32+29) >> 10 | (uint64_t)SRC1(ip, i*32+30) << 36;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*23+22) = (uint64_t)SRC(ip, i*32+30) >> 28;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*23+22) |= (uint64_t)SRC(ip, i*32+31) << 18;\ -} - -#define BITPACK64_46(ip, op, parm) { \ - BITBLK64_46(ip, 0, op, parm); SRCI(ip); op += 46*4/sizeof(op[0]);\ -} - -#define BITBLK64_47(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*47+ 0) = (uint64_t)SRC(ip, i*64+ 0) | 
(uint64_t)SRC1(ip, i*64+1) << 47;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*47+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 17 | (uint64_t)SRC1(ip, i*64+2) << 30;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*47+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 34;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*47+ 2) |= (uint64_t)SRC(ip, i*64+ 3) << 13 | (uint64_t)SRC1(ip, i*64+4) << 60;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*47+ 3) = (uint64_t)SRC(ip, i*64+ 4) >> 4 | (uint64_t)SRC1(ip, i*64+5) << 43;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*47+ 4) = (uint64_t)SRC(ip, i*64+ 5) >> 21 | (uint64_t)SRC1(ip, i*64+6) << 26;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*47+ 5) = (uint64_t)SRC(ip, i*64+ 6) >> 38;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*47+ 5) |= (uint64_t)SRC(ip, i*64+ 7) << 9 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*47+ 6) = (uint64_t)SRC(ip, i*64+ 8) >> 8 | (uint64_t)SRC1(ip, i*64+9) << 39;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*47+ 7) = (uint64_t)SRC(ip, i*64+ 9) >> 25 | (uint64_t)SRC1(ip, i*64+10) << 22;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*47+ 8) = (uint64_t)SRC(ip, i*64+10) >> 42;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*47+ 8) |= (uint64_t)SRC(ip, i*64+11) << 5 | (uint64_t)SRC1(ip, i*64+12) << 52;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*47+ 9) = (uint64_t)SRC(ip, i*64+12) >> 12 | (uint64_t)SRC1(ip, i*64+13) << 35;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*47+10) = (uint64_t)SRC(ip, i*64+13) >> 29 | (uint64_t)SRC1(ip, i*64+14) << 18;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*47+11) = (uint64_t)SRC(ip, i*64+14) >> 46;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*47+11) |= (uint64_t)SRC(ip, i*64+15) << 1 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*47+12) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 31;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*47+13) = (uint64_t)SRC(ip, i*64+17) >> 33;\ - 
IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*47+13) |= (uint64_t)SRC(ip, i*64+18) << 14 | (uint64_t)SRC1(ip, i*64+19) << 61;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*47+14) = (uint64_t)SRC(ip, i*64+19) >> 3 | (uint64_t)SRC1(ip, i*64+20) << 44;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*47+15) = (uint64_t)SRC(ip, i*64+20) >> 20 | (uint64_t)SRC1(ip, i*64+21) << 27;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*47+16) = (uint64_t)SRC(ip, i*64+21) >> 37;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*47+16) |= (uint64_t)SRC(ip, i*64+22) << 10 | (uint64_t)SRC1(ip, i*64+23) << 57;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*47+17) = (uint64_t)SRC(ip, i*64+23) >> 7 | (uint64_t)SRC1(ip, i*64+24) << 40;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*47+18) = (uint64_t)SRC(ip, i*64+24) >> 24 | (uint64_t)SRC1(ip, i*64+25) << 23;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*47+19) = (uint64_t)SRC(ip, i*64+25) >> 41;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*47+19) |= (uint64_t)SRC(ip, i*64+26) << 6 | (uint64_t)SRC1(ip, i*64+27) << 53;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*47+20) = (uint64_t)SRC(ip, i*64+27) >> 11 | (uint64_t)SRC1(ip, i*64+28) << 36;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*47+21) = (uint64_t)SRC(ip, i*64+28) >> 28 | (uint64_t)SRC1(ip, i*64+29) << 19;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*47+22) = (uint64_t)SRC(ip, i*64+29) >> 45;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*47+22) |= (uint64_t)SRC(ip, i*64+30) << 2 | (uint64_t)SRC1(ip, i*64+31) << 49;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*47+23) = (uint64_t)SRC(ip, i*64+31) >> 15;\ -} - -#define BITPACK64_47(ip, op, parm) { \ - BITBLK64_47(ip, 0, op, parm); SRCI(ip); op += 47*4/sizeof(op[0]);\ -} - -#define BITBLK64_48(ip, i, op, parm) { ;\ - IPPB(ip, i*4+ 0, parm); *((uint64_t *)op+i*3+ 0) = (uint64_t)SRC(ip, i*4+ 0) | (uint64_t)SRC1(ip, i*4+1) << 48;\ - IPPB(ip, i*4+ 1, parm); *((uint64_t *)op+i*3+ 1) = (uint64_t)SRC(ip, i*4+ 1) >> 16 | 
(uint64_t)SRC1(ip, i*4+2) << 32;\ - IPPB(ip, i*4+ 2, parm); *((uint64_t *)op+i*3+ 2) = (uint64_t)SRC(ip, i*4+ 2) >> 32;\ - IPPB(ip, i*4+ 3, parm); *((uint64_t *)op+i*3+ 2) |= (uint64_t)SRC(ip, i*4+ 3) << 16;\ -} - -#define BITPACK64_48(ip, op, parm) { \ - BITBLK64_48(ip, 0, op, parm);\ - BITBLK64_48(ip, 1, op, parm);\ - BITBLK64_48(ip, 2, op, parm);\ - BITBLK64_48(ip, 3, op, parm);\ - BITBLK64_48(ip, 4, op, parm);\ - BITBLK64_48(ip, 5, op, parm);\ - BITBLK64_48(ip, 6, op, parm);\ - BITBLK64_48(ip, 7, op, parm); SRCI(ip); op += 48*4/sizeof(op[0]);\ -} - -#define BITBLK64_49(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*49+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 49;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*49+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 15 | (uint64_t)SRC1(ip, i*64+2) << 34;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*49+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 30 | (uint64_t)SRC1(ip, i*64+3) << 19;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*49+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 45;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*49+ 3) |= (uint64_t)SRC(ip, i*64+ 4) << 4 | (uint64_t)SRC1(ip, i*64+5) << 53;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*49+ 4) = (uint64_t)SRC(ip, i*64+ 5) >> 11 | (uint64_t)SRC1(ip, i*64+6) << 38;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*49+ 5) = (uint64_t)SRC(ip, i*64+ 6) >> 26 | (uint64_t)SRC1(ip, i*64+7) << 23;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*49+ 6) = (uint64_t)SRC(ip, i*64+ 7) >> 41;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*49+ 6) |= (uint64_t)SRC(ip, i*64+ 8) << 8 | (uint64_t)SRC1(ip, i*64+9) << 57;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*49+ 7) = (uint64_t)SRC(ip, i*64+ 9) >> 7 | (uint64_t)SRC1(ip, i*64+10) << 42;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*49+ 8) = (uint64_t)SRC(ip, i*64+10) >> 22 | (uint64_t)SRC1(ip, i*64+11) << 27;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*49+ 9) = (uint64_t)SRC(ip, i*64+11) >> 37;\ - 
IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*49+ 9) |= (uint64_t)SRC(ip, i*64+12) << 12 | (uint64_t)SRC1(ip, i*64+13) << 61;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*49+10) = (uint64_t)SRC(ip, i*64+13) >> 3 | (uint64_t)SRC1(ip, i*64+14) << 46;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*49+11) = (uint64_t)SRC(ip, i*64+14) >> 18 | (uint64_t)SRC1(ip, i*64+15) << 31;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*49+12) = (uint64_t)SRC(ip, i*64+15) >> 33 | (uint64_t)SRC1(ip, i*64+16) << 16;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*49+13) = (uint64_t)SRC(ip, i*64+16) >> 48;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*49+13) |= (uint64_t)SRC(ip, i*64+17) << 1 | (uint64_t)SRC1(ip, i*64+18) << 50;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*49+14) = (uint64_t)SRC(ip, i*64+18) >> 14 | (uint64_t)SRC1(ip, i*64+19) << 35;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*49+15) = (uint64_t)SRC(ip, i*64+19) >> 29 | (uint64_t)SRC1(ip, i*64+20) << 20;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*49+16) = (uint64_t)SRC(ip, i*64+20) >> 44;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*49+16) |= (uint64_t)SRC(ip, i*64+21) << 5 | (uint64_t)SRC1(ip, i*64+22) << 54;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*49+17) = (uint64_t)SRC(ip, i*64+22) >> 10 | (uint64_t)SRC1(ip, i*64+23) << 39;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*49+18) = (uint64_t)SRC(ip, i*64+23) >> 25 | (uint64_t)SRC1(ip, i*64+24) << 24;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*49+19) = (uint64_t)SRC(ip, i*64+24) >> 40;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*49+19) |= (uint64_t)SRC(ip, i*64+25) << 9 | (uint64_t)SRC1(ip, i*64+26) << 58;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*49+20) = (uint64_t)SRC(ip, i*64+26) >> 6 | (uint64_t)SRC1(ip, i*64+27) << 43;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*49+21) = (uint64_t)SRC(ip, i*64+27) >> 21 | (uint64_t)SRC1(ip, i*64+28) << 28;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*49+22) = (uint64_t)SRC(ip, i*64+28) >> 36;\ - 
IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*49+22) |= (uint64_t)SRC(ip, i*64+29) << 13 | (uint64_t)SRC1(ip, i*64+30) << 62;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*49+23) = (uint64_t)SRC(ip, i*64+30) >> 2 | (uint64_t)SRC1(ip, i*64+31) << 47;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*49+24) = (uint64_t)SRC(ip, i*64+31) >> 17;\ -} - -#define BITPACK64_49(ip, op, parm) { \ - BITBLK64_49(ip, 0, op, parm); SRCI(ip); op += 49*4/sizeof(op[0]);\ -} - -#define BITBLK64_50(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*25+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 50;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*25+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 14 | (uint64_t)SRC1(ip, i*32+2) << 36;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*25+ 2) = (uint64_t)SRC(ip, i*32+ 2) >> 28 | (uint64_t)SRC1(ip, i*32+3) << 22;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*25+ 3) = (uint64_t)SRC(ip, i*32+ 3) >> 42;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*25+ 3) |= (uint64_t)SRC(ip, i*32+ 4) << 8 | (uint64_t)SRC1(ip, i*32+5) << 58;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*25+ 4) = (uint64_t)SRC(ip, i*32+ 5) >> 6 | (uint64_t)SRC1(ip, i*32+6) << 44;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*25+ 5) = (uint64_t)SRC(ip, i*32+ 6) >> 20 | (uint64_t)SRC1(ip, i*32+7) << 30;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*25+ 6) = (uint64_t)SRC(ip, i*32+ 7) >> 34 | (uint64_t)SRC1(ip, i*32+8) << 16;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*25+ 7) = (uint64_t)SRC(ip, i*32+ 8) >> 48;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*25+ 7) |= (uint64_t)SRC(ip, i*32+ 9) << 2 | (uint64_t)SRC1(ip, i*32+10) << 52;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*25+ 8) = (uint64_t)SRC(ip, i*32+10) >> 12 | (uint64_t)SRC1(ip, i*32+11) << 38;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*25+ 9) = (uint64_t)SRC(ip, i*32+11) >> 26 | (uint64_t)SRC1(ip, i*32+12) << 24;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*25+10) = 
(uint64_t)SRC(ip, i*32+12) >> 40;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*25+10) |= (uint64_t)SRC(ip, i*32+13) << 10 | (uint64_t)SRC1(ip, i*32+14) << 60;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*25+11) = (uint64_t)SRC(ip, i*32+14) >> 4 | (uint64_t)SRC1(ip, i*32+15) << 46;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*25+12) = (uint64_t)SRC(ip, i*32+15) >> 18 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*25+13) = (uint64_t)SRC(ip, i*32+16) >> 32 | (uint64_t)SRC1(ip, i*32+17) << 18;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*25+14) = (uint64_t)SRC(ip, i*32+17) >> 46;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*25+14) |= (uint64_t)SRC(ip, i*32+18) << 4 | (uint64_t)SRC1(ip, i*32+19) << 54;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*25+15) = (uint64_t)SRC(ip, i*32+19) >> 10 | (uint64_t)SRC1(ip, i*32+20) << 40;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*25+16) = (uint64_t)SRC(ip, i*32+20) >> 24 | (uint64_t)SRC1(ip, i*32+21) << 26;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*25+17) = (uint64_t)SRC(ip, i*32+21) >> 38;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*25+17) |= (uint64_t)SRC(ip, i*32+22) << 12 | (uint64_t)SRC1(ip, i*32+23) << 62;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*25+18) = (uint64_t)SRC(ip, i*32+23) >> 2 | (uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*25+19) = (uint64_t)SRC(ip, i*32+24) >> 16 | (uint64_t)SRC1(ip, i*32+25) << 34;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*25+20) = (uint64_t)SRC(ip, i*32+25) >> 30 | (uint64_t)SRC1(ip, i*32+26) << 20;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*25+21) = (uint64_t)SRC(ip, i*32+26) >> 44;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*25+21) |= (uint64_t)SRC(ip, i*32+27) << 6 | (uint64_t)SRC1(ip, i*32+28) << 56;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*25+22) = (uint64_t)SRC(ip, i*32+28) >> 8 | (uint64_t)SRC1(ip, i*32+29) << 42;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*25+23) = 
(uint64_t)SRC(ip, i*32+29) >> 22 | (uint64_t)SRC1(ip, i*32+30) << 28;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*25+24) = (uint64_t)SRC(ip, i*32+30) >> 36;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*25+24) |= (uint64_t)SRC(ip, i*32+31) << 14;\ -} - -#define BITPACK64_50(ip, op, parm) { \ - BITBLK64_50(ip, 0, op, parm); SRCI(ip); op += 50*4/sizeof(op[0]);\ -} - -#define BITBLK64_51(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*51+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 51;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*51+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 13 | (uint64_t)SRC1(ip, i*64+2) << 38;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*51+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 26 | (uint64_t)SRC1(ip, i*64+3) << 25;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*51+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 39;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*51+ 3) |= (uint64_t)SRC(ip, i*64+ 4) << 12 | (uint64_t)SRC1(ip, i*64+5) << 63;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*51+ 4) = (uint64_t)SRC(ip, i*64+ 5) >> 1 | (uint64_t)SRC1(ip, i*64+6) << 50;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*51+ 5) = (uint64_t)SRC(ip, i*64+ 6) >> 14 | (uint64_t)SRC1(ip, i*64+7) << 37;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*51+ 6) = (uint64_t)SRC(ip, i*64+ 7) >> 27 | (uint64_t)SRC1(ip, i*64+8) << 24;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*51+ 7) = (uint64_t)SRC(ip, i*64+ 8) >> 40;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*51+ 7) |= (uint64_t)SRC(ip, i*64+ 9) << 11 | (uint64_t)SRC1(ip, i*64+10) << 62;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*51+ 8) = (uint64_t)SRC(ip, i*64+10) >> 2 | (uint64_t)SRC1(ip, i*64+11) << 49;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*51+ 9) = (uint64_t)SRC(ip, i*64+11) >> 15 | (uint64_t)SRC1(ip, i*64+12) << 36;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*51+10) = (uint64_t)SRC(ip, i*64+12) >> 28 | (uint64_t)SRC1(ip, i*64+13) << 23;\ - IPPB(ip, i*64+13, parm); 
*((uint64_t *)op+i*51+11) = (uint64_t)SRC(ip, i*64+13) >> 41;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*51+11) |= (uint64_t)SRC(ip, i*64+14) << 10 | (uint64_t)SRC1(ip, i*64+15) << 61;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*51+12) = (uint64_t)SRC(ip, i*64+15) >> 3 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*51+13) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 35;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*51+14) = (uint64_t)SRC(ip, i*64+17) >> 29 | (uint64_t)SRC1(ip, i*64+18) << 22;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*51+15) = (uint64_t)SRC(ip, i*64+18) >> 42;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*51+15) |= (uint64_t)SRC(ip, i*64+19) << 9 | (uint64_t)SRC1(ip, i*64+20) << 60;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*51+16) = (uint64_t)SRC(ip, i*64+20) >> 4 | (uint64_t)SRC1(ip, i*64+21) << 47;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*51+17) = (uint64_t)SRC(ip, i*64+21) >> 17 | (uint64_t)SRC1(ip, i*64+22) << 34;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*51+18) = (uint64_t)SRC(ip, i*64+22) >> 30 | (uint64_t)SRC1(ip, i*64+23) << 21;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*51+19) = (uint64_t)SRC(ip, i*64+23) >> 43;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*51+19) |= (uint64_t)SRC(ip, i*64+24) << 8 | (uint64_t)SRC1(ip, i*64+25) << 59;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*51+20) = (uint64_t)SRC(ip, i*64+25) >> 5 | (uint64_t)SRC1(ip, i*64+26) << 46;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*51+21) = (uint64_t)SRC(ip, i*64+26) >> 18 | (uint64_t)SRC1(ip, i*64+27) << 33;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*51+22) = (uint64_t)SRC(ip, i*64+27) >> 31 | (uint64_t)SRC1(ip, i*64+28) << 20;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*51+23) = (uint64_t)SRC(ip, i*64+28) >> 44;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*51+23) |= (uint64_t)SRC(ip, i*64+29) << 7 | (uint64_t)SRC1(ip, i*64+30) << 58;\ - IPPB(ip, i*64+30, parm); 
*((uint64_t *)op+i*51+24) = (uint64_t)SRC(ip, i*64+30) >> 6 | (uint64_t)SRC1(ip, i*64+31) << 45;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*51+25) = (uint64_t)SRC(ip, i*64+31) >> 19;\ -} - -#define BITPACK64_51(ip, op, parm) { \ - BITBLK64_51(ip, 0, op, parm); SRCI(ip); op += 51*4/sizeof(op[0]);\ -} - -#define BITBLK64_52(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*13+ 0) = (uint64_t)SRC(ip, i*16+ 0) | (uint64_t)SRC1(ip, i*16+1) << 52;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*13+ 1) = (uint64_t)SRC(ip, i*16+ 1) >> 12 | (uint64_t)SRC1(ip, i*16+2) << 40;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*13+ 2) = (uint64_t)SRC(ip, i*16+ 2) >> 24 | (uint64_t)SRC1(ip, i*16+3) << 28;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*13+ 3) = (uint64_t)SRC(ip, i*16+ 3) >> 36 | (uint64_t)SRC1(ip, i*16+4) << 16;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*13+ 4) = (uint64_t)SRC(ip, i*16+ 4) >> 48;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*13+ 4) |= (uint64_t)SRC(ip, i*16+ 5) << 4 | (uint64_t)SRC1(ip, i*16+6) << 56;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*13+ 5) = (uint64_t)SRC(ip, i*16+ 6) >> 8 | (uint64_t)SRC1(ip, i*16+7) << 44;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*13+ 6) = (uint64_t)SRC(ip, i*16+ 7) >> 20 | (uint64_t)SRC1(ip, i*16+8) << 32;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*13+ 7) = (uint64_t)SRC(ip, i*16+ 8) >> 32 | (uint64_t)SRC1(ip, i*16+9) << 20;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*13+ 8) = (uint64_t)SRC(ip, i*16+ 9) >> 44;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*13+ 8) |= (uint64_t)SRC(ip, i*16+10) << 8 | (uint64_t)SRC1(ip, i*16+11) << 60;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*13+ 9) = (uint64_t)SRC(ip, i*16+11) >> 4 | (uint64_t)SRC1(ip, i*16+12) << 48;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*13+10) = (uint64_t)SRC(ip, i*16+12) >> 16 | (uint64_t)SRC1(ip, i*16+13) << 36;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*13+11) = (uint64_t)SRC(ip, i*16+13) >> 28 | 
(uint64_t)SRC1(ip, i*16+14) << 24;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*13+12) = (uint64_t)SRC(ip, i*16+14) >> 40;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*13+12) |= (uint64_t)SRC(ip, i*16+15) << 12;\ -} - -#define BITPACK64_52(ip, op, parm) { \ - BITBLK64_52(ip, 0, op, parm);\ - BITBLK64_52(ip, 1, op, parm); SRCI(ip); op += 52*4/sizeof(op[0]);\ -} - -#define BITBLK64_53(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*53+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 53;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*53+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 11 | (uint64_t)SRC1(ip, i*64+2) << 42;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*53+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 22 | (uint64_t)SRC1(ip, i*64+3) << 31;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*53+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 33 | (uint64_t)SRC1(ip, i*64+4) << 20;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*53+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 44;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*53+ 4) |= (uint64_t)SRC(ip, i*64+ 5) << 9 | (uint64_t)SRC1(ip, i*64+6) << 62;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*53+ 5) = (uint64_t)SRC(ip, i*64+ 6) >> 2 | (uint64_t)SRC1(ip, i*64+7) << 51;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*53+ 6) = (uint64_t)SRC(ip, i*64+ 7) >> 13 | (uint64_t)SRC1(ip, i*64+8) << 40;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*53+ 7) = (uint64_t)SRC(ip, i*64+ 8) >> 24 | (uint64_t)SRC1(ip, i*64+9) << 29;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*53+ 8) = (uint64_t)SRC(ip, i*64+ 9) >> 35 | (uint64_t)SRC1(ip, i*64+10) << 18;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*53+ 9) = (uint64_t)SRC(ip, i*64+10) >> 46;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*53+ 9) |= (uint64_t)SRC(ip, i*64+11) << 7 | (uint64_t)SRC1(ip, i*64+12) << 60;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*53+10) = (uint64_t)SRC(ip, i*64+12) >> 4 | (uint64_t)SRC1(ip, i*64+13) << 49;\ - IPPB(ip, i*64+13, parm); 
*((uint64_t *)op+i*53+11) = (uint64_t)SRC(ip, i*64+13) >> 15 | (uint64_t)SRC1(ip, i*64+14) << 38;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*53+12) = (uint64_t)SRC(ip, i*64+14) >> 26 | (uint64_t)SRC1(ip, i*64+15) << 27;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*53+13) = (uint64_t)SRC(ip, i*64+15) >> 37 | (uint64_t)SRC1(ip, i*64+16) << 16;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*53+14) = (uint64_t)SRC(ip, i*64+16) >> 48;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*53+14) |= (uint64_t)SRC(ip, i*64+17) << 5 | (uint64_t)SRC1(ip, i*64+18) << 58;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*53+15) = (uint64_t)SRC(ip, i*64+18) >> 6 | (uint64_t)SRC1(ip, i*64+19) << 47;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*53+16) = (uint64_t)SRC(ip, i*64+19) >> 17 | (uint64_t)SRC1(ip, i*64+20) << 36;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*53+17) = (uint64_t)SRC(ip, i*64+20) >> 28 | (uint64_t)SRC1(ip, i*64+21) << 25;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*53+18) = (uint64_t)SRC(ip, i*64+21) >> 39 | (uint64_t)SRC1(ip, i*64+22) << 14;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*53+19) = (uint64_t)SRC(ip, i*64+22) >> 50;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*53+19) |= (uint64_t)SRC(ip, i*64+23) << 3 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*53+20) = (uint64_t)SRC(ip, i*64+24) >> 8 | (uint64_t)SRC1(ip, i*64+25) << 45;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*53+21) = (uint64_t)SRC(ip, i*64+25) >> 19 | (uint64_t)SRC1(ip, i*64+26) << 34;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*53+22) = (uint64_t)SRC(ip, i*64+26) >> 30 | (uint64_t)SRC1(ip, i*64+27) << 23;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*53+23) = (uint64_t)SRC(ip, i*64+27) >> 41 | (uint64_t)SRC1(ip, i*64+28) << 12;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*53+24) = (uint64_t)SRC(ip, i*64+28) >> 52;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*53+24) |= (uint64_t)SRC(ip, i*64+29) << 1 | (uint64_t)SRC1(ip, i*64+30) << 
54;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*53+25) = (uint64_t)SRC(ip, i*64+30) >> 10 | (uint64_t)SRC1(ip, i*64+31) << 43;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*53+26) = (uint64_t)SRC(ip, i*64+31) >> 21;\ -} - -#define BITPACK64_53(ip, op, parm) { \ - BITBLK64_53(ip, 0, op, parm); SRCI(ip); op += 53*4/sizeof(op[0]);\ -} - -#define BITBLK64_54(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*27+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 54;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*27+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 10 | (uint64_t)SRC1(ip, i*32+2) << 44;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*27+ 2) = (uint64_t)SRC(ip, i*32+ 2) >> 20 | (uint64_t)SRC1(ip, i*32+3) << 34;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*27+ 3) = (uint64_t)SRC(ip, i*32+ 3) >> 30 | (uint64_t)SRC1(ip, i*32+4) << 24;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*27+ 4) = (uint64_t)SRC(ip, i*32+ 4) >> 40 | (uint64_t)SRC1(ip, i*32+5) << 14;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*27+ 5) = (uint64_t)SRC(ip, i*32+ 5) >> 50;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*27+ 5) |= (uint64_t)SRC(ip, i*32+ 6) << 4 | (uint64_t)SRC1(ip, i*32+7) << 58;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*27+ 6) = (uint64_t)SRC(ip, i*32+ 7) >> 6 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*27+ 7) = (uint64_t)SRC(ip, i*32+ 8) >> 16 | (uint64_t)SRC1(ip, i*32+9) << 38;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*27+ 8) = (uint64_t)SRC(ip, i*32+ 9) >> 26 | (uint64_t)SRC1(ip, i*32+10) << 28;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*27+ 9) = (uint64_t)SRC(ip, i*32+10) >> 36 | (uint64_t)SRC1(ip, i*32+11) << 18;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*27+10) = (uint64_t)SRC(ip, i*32+11) >> 46;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*27+10) |= (uint64_t)SRC(ip, i*32+12) << 8 | (uint64_t)SRC1(ip, i*32+13) << 62;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*27+11) = 
(uint64_t)SRC(ip, i*32+13) >> 2 | (uint64_t)SRC1(ip, i*32+14) << 52;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*27+12) = (uint64_t)SRC(ip, i*32+14) >> 12 | (uint64_t)SRC1(ip, i*32+15) << 42;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*27+13) = (uint64_t)SRC(ip, i*32+15) >> 22 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*27+14) = (uint64_t)SRC(ip, i*32+16) >> 32 | (uint64_t)SRC1(ip, i*32+17) << 22;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*27+15) = (uint64_t)SRC(ip, i*32+17) >> 42 | (uint64_t)SRC1(ip, i*32+18) << 12;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*27+16) = (uint64_t)SRC(ip, i*32+18) >> 52;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*27+16) |= (uint64_t)SRC(ip, i*32+19) << 2 | (uint64_t)SRC1(ip, i*32+20) << 56;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*27+17) = (uint64_t)SRC(ip, i*32+20) >> 8 | (uint64_t)SRC1(ip, i*32+21) << 46;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*27+18) = (uint64_t)SRC(ip, i*32+21) >> 18 | (uint64_t)SRC1(ip, i*32+22) << 36;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*27+19) = (uint64_t)SRC(ip, i*32+22) >> 28 | (uint64_t)SRC1(ip, i*32+23) << 26;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*27+20) = (uint64_t)SRC(ip, i*32+23) >> 38 | (uint64_t)SRC1(ip, i*32+24) << 16;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*27+21) = (uint64_t)SRC(ip, i*32+24) >> 48;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*27+21) |= (uint64_t)SRC(ip, i*32+25) << 6 | (uint64_t)SRC1(ip, i*32+26) << 60;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*27+22) = (uint64_t)SRC(ip, i*32+26) >> 4 | (uint64_t)SRC1(ip, i*32+27) << 50;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*27+23) = (uint64_t)SRC(ip, i*32+27) >> 14 | (uint64_t)SRC1(ip, i*32+28) << 40;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*27+24) = (uint64_t)SRC(ip, i*32+28) >> 24 | (uint64_t)SRC1(ip, i*32+29) << 30;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*27+25) = (uint64_t)SRC(ip, i*32+29) >> 34 | (uint64_t)SRC1(ip, 
i*32+30) << 20;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*27+26) = (uint64_t)SRC(ip, i*32+30) >> 44;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*27+26) |= (uint64_t)SRC(ip, i*32+31) << 10;\ -} - -#define BITPACK64_54(ip, op, parm) { \ - BITBLK64_54(ip, 0, op, parm); SRCI(ip); op += 54*4/sizeof(op[0]);\ -} - -#define BITBLK64_55(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*55+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 55;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*55+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 9 | (uint64_t)SRC1(ip, i*64+2) << 46;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*55+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 18 | (uint64_t)SRC1(ip, i*64+3) << 37;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*55+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 27 | (uint64_t)SRC1(ip, i*64+4) << 28;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*55+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 36 | (uint64_t)SRC1(ip, i*64+5) << 19;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*55+ 5) = (uint64_t)SRC(ip, i*64+ 5) >> 45 | (uint64_t)SRC1(ip, i*64+6) << 10;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*55+ 6) = (uint64_t)SRC(ip, i*64+ 6) >> 54;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*55+ 6) |= (uint64_t)SRC(ip, i*64+ 7) << 1 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*55+ 7) = (uint64_t)SRC(ip, i*64+ 8) >> 8 | (uint64_t)SRC1(ip, i*64+9) << 47;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*55+ 8) = (uint64_t)SRC(ip, i*64+ 9) >> 17 | (uint64_t)SRC1(ip, i*64+10) << 38;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*55+ 9) = (uint64_t)SRC(ip, i*64+10) >> 26 | (uint64_t)SRC1(ip, i*64+11) << 29;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*55+10) = (uint64_t)SRC(ip, i*64+11) >> 35 | (uint64_t)SRC1(ip, i*64+12) << 20;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*55+11) = (uint64_t)SRC(ip, i*64+12) >> 44 | (uint64_t)SRC1(ip, i*64+13) << 11;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*55+12) 
= (uint64_t)SRC(ip, i*64+13) >> 53;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*55+12) |= (uint64_t)SRC(ip, i*64+14) << 2 | (uint64_t)SRC1(ip, i*64+15) << 57;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*55+13) = (uint64_t)SRC(ip, i*64+15) >> 7 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*55+14) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 39;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*55+15) = (uint64_t)SRC(ip, i*64+17) >> 25 | (uint64_t)SRC1(ip, i*64+18) << 30;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*55+16) = (uint64_t)SRC(ip, i*64+18) >> 34 | (uint64_t)SRC1(ip, i*64+19) << 21;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*55+17) = (uint64_t)SRC(ip, i*64+19) >> 43 | (uint64_t)SRC1(ip, i*64+20) << 12;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*55+18) = (uint64_t)SRC(ip, i*64+20) >> 52;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*55+18) |= (uint64_t)SRC(ip, i*64+21) << 3 | (uint64_t)SRC1(ip, i*64+22) << 58;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*55+19) = (uint64_t)SRC(ip, i*64+22) >> 6 | (uint64_t)SRC1(ip, i*64+23) << 49;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*55+20) = (uint64_t)SRC(ip, i*64+23) >> 15 | (uint64_t)SRC1(ip, i*64+24) << 40;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*55+21) = (uint64_t)SRC(ip, i*64+24) >> 24 | (uint64_t)SRC1(ip, i*64+25) << 31;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*55+22) = (uint64_t)SRC(ip, i*64+25) >> 33 | (uint64_t)SRC1(ip, i*64+26) << 22;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*55+23) = (uint64_t)SRC(ip, i*64+26) >> 42 | (uint64_t)SRC1(ip, i*64+27) << 13;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*55+24) = (uint64_t)SRC(ip, i*64+27) >> 51;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*55+24) |= (uint64_t)SRC(ip, i*64+28) << 4 | (uint64_t)SRC1(ip, i*64+29) << 59;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*55+25) = (uint64_t)SRC(ip, i*64+29) >> 5 | (uint64_t)SRC1(ip, i*64+30) << 50;\ - IPPB(ip, i*64+30, 
parm); *((uint64_t *)op+i*55+26) = (uint64_t)SRC(ip, i*64+30) >> 14 | (uint64_t)SRC1(ip, i*64+31) << 41;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*55+27) = (uint64_t)SRC(ip, i*64+31) >> 23;\ -} - -#define BITPACK64_55(ip, op, parm) { \ - BITBLK64_55(ip, 0, op, parm); SRCI(ip); op += 55*4/sizeof(op[0]);\ -} - -#define BITBLK64_56(ip, i, op, parm) { ;\ - IPPB(ip, i*8+ 0, parm); *((uint64_t *)op+i*7+ 0) = (uint64_t)SRC(ip, i*8+ 0) | (uint64_t)SRC1(ip, i*8+1) << 56;\ - IPPB(ip, i*8+ 1, parm); *((uint64_t *)op+i*7+ 1) = (uint64_t)SRC(ip, i*8+ 1) >> 8 | (uint64_t)SRC1(ip, i*8+2) << 48;\ - IPPB(ip, i*8+ 2, parm); *((uint64_t *)op+i*7+ 2) = (uint64_t)SRC(ip, i*8+ 2) >> 16 | (uint64_t)SRC1(ip, i*8+3) << 40;\ - IPPB(ip, i*8+ 3, parm); *((uint64_t *)op+i*7+ 3) = (uint64_t)SRC(ip, i*8+ 3) >> 24 | (uint64_t)SRC1(ip, i*8+4) << 32;\ - IPPB(ip, i*8+ 4, parm); *((uint64_t *)op+i*7+ 4) = (uint64_t)SRC(ip, i*8+ 4) >> 32 | (uint64_t)SRC1(ip, i*8+5) << 24;\ - IPPB(ip, i*8+ 5, parm); *((uint64_t *)op+i*7+ 5) = (uint64_t)SRC(ip, i*8+ 5) >> 40 | (uint64_t)SRC1(ip, i*8+6) << 16;\ - IPPB(ip, i*8+ 6, parm); *((uint64_t *)op+i*7+ 6) = (uint64_t)SRC(ip, i*8+ 6) >> 48;\ - IPPB(ip, i*8+ 7, parm); *((uint64_t *)op+i*7+ 6) |= (uint64_t)SRC(ip, i*8+ 7) << 8;\ -} - -#define BITPACK64_56(ip, op, parm) { \ - BITBLK64_56(ip, 0, op, parm);\ - BITBLK64_56(ip, 1, op, parm);\ - BITBLK64_56(ip, 2, op, parm);\ - BITBLK64_56(ip, 3, op, parm); SRCI(ip); op += 56*4/sizeof(op[0]);\ -} - -#define BITBLK64_57(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*57+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 57;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*57+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 7 | (uint64_t)SRC1(ip, i*64+2) << 50;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*57+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 14 | (uint64_t)SRC1(ip, i*64+3) << 43;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*57+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 21 | (uint64_t)SRC1(ip, i*64+4) << 
36;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*57+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 28 | (uint64_t)SRC1(ip, i*64+5) << 29;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*57+ 5) = (uint64_t)SRC(ip, i*64+ 5) >> 35 | (uint64_t)SRC1(ip, i*64+6) << 22;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*57+ 6) = (uint64_t)SRC(ip, i*64+ 6) >> 42 | (uint64_t)SRC1(ip, i*64+7) << 15;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*57+ 7) = (uint64_t)SRC(ip, i*64+ 7) >> 49 | (uint64_t)SRC1(ip, i*64+8) << 8;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*57+ 8) = (uint64_t)SRC(ip, i*64+ 8) >> 56;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*57+ 8) |= (uint64_t)SRC(ip, i*64+ 9) << 1 | (uint64_t)SRC1(ip, i*64+10) << 58;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*57+ 9) = (uint64_t)SRC(ip, i*64+10) >> 6 | (uint64_t)SRC1(ip, i*64+11) << 51;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*57+10) = (uint64_t)SRC(ip, i*64+11) >> 13 | (uint64_t)SRC1(ip, i*64+12) << 44;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*57+11) = (uint64_t)SRC(ip, i*64+12) >> 20 | (uint64_t)SRC1(ip, i*64+13) << 37;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*57+12) = (uint64_t)SRC(ip, i*64+13) >> 27 | (uint64_t)SRC1(ip, i*64+14) << 30;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*57+13) = (uint64_t)SRC(ip, i*64+14) >> 34 | (uint64_t)SRC1(ip, i*64+15) << 23;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*57+14) = (uint64_t)SRC(ip, i*64+15) >> 41 | (uint64_t)SRC1(ip, i*64+16) << 16;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*57+15) = (uint64_t)SRC(ip, i*64+16) >> 48 | (uint64_t)SRC1(ip, i*64+17) << 9;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*57+16) = (uint64_t)SRC(ip, i*64+17) >> 55;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*57+16) |= (uint64_t)SRC(ip, i*64+18) << 2 | (uint64_t)SRC1(ip, i*64+19) << 59;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*57+17) = (uint64_t)SRC(ip, i*64+19) >> 5 | (uint64_t)SRC1(ip, i*64+20) << 52;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*57+18) = 
(uint64_t)SRC(ip, i*64+20) >> 12 | (uint64_t)SRC1(ip, i*64+21) << 45;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*57+19) = (uint64_t)SRC(ip, i*64+21) >> 19 | (uint64_t)SRC1(ip, i*64+22) << 38;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*57+20) = (uint64_t)SRC(ip, i*64+22) >> 26 | (uint64_t)SRC1(ip, i*64+23) << 31;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*57+21) = (uint64_t)SRC(ip, i*64+23) >> 33 | (uint64_t)SRC1(ip, i*64+24) << 24;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*57+22) = (uint64_t)SRC(ip, i*64+24) >> 40 | (uint64_t)SRC1(ip, i*64+25) << 17;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*57+23) = (uint64_t)SRC(ip, i*64+25) >> 47 | (uint64_t)SRC1(ip, i*64+26) << 10;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*57+24) = (uint64_t)SRC(ip, i*64+26) >> 54;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*57+24) |= (uint64_t)SRC(ip, i*64+27) << 3 | (uint64_t)SRC1(ip, i*64+28) << 60;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*57+25) = (uint64_t)SRC(ip, i*64+28) >> 4 | (uint64_t)SRC1(ip, i*64+29) << 53;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*57+26) = (uint64_t)SRC(ip, i*64+29) >> 11 | (uint64_t)SRC1(ip, i*64+30) << 46;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*57+27) = (uint64_t)SRC(ip, i*64+30) >> 18 | (uint64_t)SRC1(ip, i*64+31) << 39;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*57+28) = (uint64_t)SRC(ip, i*64+31) >> 25;\ -} - -#define BITPACK64_57(ip, op, parm) { \ - BITBLK64_57(ip, 0, op, parm); SRCI(ip); op += 57*4/sizeof(op[0]);\ -} - -#define BITBLK64_58(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*29+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, i*32+1) << 58;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*29+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 6 | (uint64_t)SRC1(ip, i*32+2) << 52;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*29+ 2) = (uint64_t)SRC(ip, i*32+ 2) >> 12 | (uint64_t)SRC1(ip, i*32+3) << 46;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*29+ 3) = (uint64_t)SRC(ip, i*32+ 3) >> 
18 | (uint64_t)SRC1(ip, i*32+4) << 40;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*29+ 4) = (uint64_t)SRC(ip, i*32+ 4) >> 24 | (uint64_t)SRC1(ip, i*32+5) << 34;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*29+ 5) = (uint64_t)SRC(ip, i*32+ 5) >> 30 | (uint64_t)SRC1(ip, i*32+6) << 28;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*29+ 6) = (uint64_t)SRC(ip, i*32+ 6) >> 36 | (uint64_t)SRC1(ip, i*32+7) << 22;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*29+ 7) = (uint64_t)SRC(ip, i*32+ 7) >> 42 | (uint64_t)SRC1(ip, i*32+8) << 16;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*29+ 8) = (uint64_t)SRC(ip, i*32+ 8) >> 48 | (uint64_t)SRC1(ip, i*32+9) << 10;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*29+ 9) = (uint64_t)SRC(ip, i*32+ 9) >> 54;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*29+ 9) |= (uint64_t)SRC(ip, i*32+10) << 4 | (uint64_t)SRC1(ip, i*32+11) << 62;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*29+10) = (uint64_t)SRC(ip, i*32+11) >> 2 | (uint64_t)SRC1(ip, i*32+12) << 56;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*29+11) = (uint64_t)SRC(ip, i*32+12) >> 8 | (uint64_t)SRC1(ip, i*32+13) << 50;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*29+12) = (uint64_t)SRC(ip, i*32+13) >> 14 | (uint64_t)SRC1(ip, i*32+14) << 44;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*29+13) = (uint64_t)SRC(ip, i*32+14) >> 20 | (uint64_t)SRC1(ip, i*32+15) << 38;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*29+14) = (uint64_t)SRC(ip, i*32+15) >> 26 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*29+15) = (uint64_t)SRC(ip, i*32+16) >> 32 | (uint64_t)SRC1(ip, i*32+17) << 26;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*29+16) = (uint64_t)SRC(ip, i*32+17) >> 38 | (uint64_t)SRC1(ip, i*32+18) << 20;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*29+17) = (uint64_t)SRC(ip, i*32+18) >> 44 | (uint64_t)SRC1(ip, i*32+19) << 14;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*29+18) = (uint64_t)SRC(ip, i*32+19) >> 50 | (uint64_t)SRC1(ip, 
i*32+20) << 8;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*29+19) = (uint64_t)SRC(ip, i*32+20) >> 56;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*29+19) |= (uint64_t)SRC(ip, i*32+21) << 2 | (uint64_t)SRC1(ip, i*32+22) << 60;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*29+20) = (uint64_t)SRC(ip, i*32+22) >> 4 | (uint64_t)SRC1(ip, i*32+23) << 54;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*29+21) = (uint64_t)SRC(ip, i*32+23) >> 10 | (uint64_t)SRC1(ip, i*32+24) << 48;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*29+22) = (uint64_t)SRC(ip, i*32+24) >> 16 | (uint64_t)SRC1(ip, i*32+25) << 42;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*29+23) = (uint64_t)SRC(ip, i*32+25) >> 22 | (uint64_t)SRC1(ip, i*32+26) << 36;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*29+24) = (uint64_t)SRC(ip, i*32+26) >> 28 | (uint64_t)SRC1(ip, i*32+27) << 30;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*29+25) = (uint64_t)SRC(ip, i*32+27) >> 34 | (uint64_t)SRC1(ip, i*32+28) << 24;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*29+26) = (uint64_t)SRC(ip, i*32+28) >> 40 | (uint64_t)SRC1(ip, i*32+29) << 18;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*29+27) = (uint64_t)SRC(ip, i*32+29) >> 46 | (uint64_t)SRC1(ip, i*32+30) << 12;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*29+28) = (uint64_t)SRC(ip, i*32+30) >> 52;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*29+28) |= (uint64_t)SRC(ip, i*32+31) << 6;\ -} - -#define BITPACK64_58(ip, op, parm) { \ - BITBLK64_58(ip, 0, op, parm); SRCI(ip); op += 58*4/sizeof(op[0]);\ -} - -#define BITBLK64_59(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*59+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 59;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*59+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 5 | (uint64_t)SRC1(ip, i*64+2) << 54;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*59+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 10 | (uint64_t)SRC1(ip, i*64+3) << 49;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*59+ 
3) = (uint64_t)SRC(ip, i*64+ 3) >> 15 | (uint64_t)SRC1(ip, i*64+4) << 44;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*59+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 20 | (uint64_t)SRC1(ip, i*64+5) << 39;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*59+ 5) = (uint64_t)SRC(ip, i*64+ 5) >> 25 | (uint64_t)SRC1(ip, i*64+6) << 34;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*59+ 6) = (uint64_t)SRC(ip, i*64+ 6) >> 30 | (uint64_t)SRC1(ip, i*64+7) << 29;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*59+ 7) = (uint64_t)SRC(ip, i*64+ 7) >> 35 | (uint64_t)SRC1(ip, i*64+8) << 24;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*59+ 8) = (uint64_t)SRC(ip, i*64+ 8) >> 40 | (uint64_t)SRC1(ip, i*64+9) << 19;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*59+ 9) = (uint64_t)SRC(ip, i*64+ 9) >> 45 | (uint64_t)SRC1(ip, i*64+10) << 14;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*59+10) = (uint64_t)SRC(ip, i*64+10) >> 50 | (uint64_t)SRC1(ip, i*64+11) << 9;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*59+11) = (uint64_t)SRC(ip, i*64+11) >> 55;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*59+11) |= (uint64_t)SRC(ip, i*64+12) << 4 | (uint64_t)SRC1(ip, i*64+13) << 63;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*59+12) = (uint64_t)SRC(ip, i*64+13) >> 1 | (uint64_t)SRC1(ip, i*64+14) << 58;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*59+13) = (uint64_t)SRC(ip, i*64+14) >> 6 | (uint64_t)SRC1(ip, i*64+15) << 53;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*59+14) = (uint64_t)SRC(ip, i*64+15) >> 11 | (uint64_t)SRC1(ip, i*64+16) << 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*59+15) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 43;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*59+16) = (uint64_t)SRC(ip, i*64+17) >> 21 | (uint64_t)SRC1(ip, i*64+18) << 38;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*59+17) = (uint64_t)SRC(ip, i*64+18) >> 26 | (uint64_t)SRC1(ip, i*64+19) << 33;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*59+18) = (uint64_t)SRC(ip, 
i*64+19) >> 31 | (uint64_t)SRC1(ip, i*64+20) << 28;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*59+19) = (uint64_t)SRC(ip, i*64+20) >> 36 | (uint64_t)SRC1(ip, i*64+21) << 23;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*59+20) = (uint64_t)SRC(ip, i*64+21) >> 41 | (uint64_t)SRC1(ip, i*64+22) << 18;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*59+21) = (uint64_t)SRC(ip, i*64+22) >> 46 | (uint64_t)SRC1(ip, i*64+23) << 13;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*59+22) = (uint64_t)SRC(ip, i*64+23) >> 51 | (uint64_t)SRC1(ip, i*64+24) << 8;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*59+23) = (uint64_t)SRC(ip, i*64+24) >> 56;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*59+23) |= (uint64_t)SRC(ip, i*64+25) << 3 | (uint64_t)SRC1(ip, i*64+26) << 62;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*59+24) = (uint64_t)SRC(ip, i*64+26) >> 2 | (uint64_t)SRC1(ip, i*64+27) << 57;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*59+25) = (uint64_t)SRC(ip, i*64+27) >> 7 | (uint64_t)SRC1(ip, i*64+28) << 52;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*59+26) = (uint64_t)SRC(ip, i*64+28) >> 12 | (uint64_t)SRC1(ip, i*64+29) << 47;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*59+27) = (uint64_t)SRC(ip, i*64+29) >> 17 | (uint64_t)SRC1(ip, i*64+30) << 42;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*59+28) = (uint64_t)SRC(ip, i*64+30) >> 22 | (uint64_t)SRC1(ip, i*64+31) << 37;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*59+29) = (uint64_t)SRC(ip, i*64+31) >> 27;\ -} - -#define BITPACK64_59(ip, op, parm) { \ - BITBLK64_59(ip, 0, op, parm); SRCI(ip); op += 59*4/sizeof(op[0]);\ -} - -#define BITBLK64_60(ip, i, op, parm) { ;\ - IPPB(ip, i*16+ 0, parm); *((uint64_t *)op+i*15+ 0) = (uint64_t)SRC(ip, i*16+ 0) | (uint64_t)SRC1(ip, i*16+1) << 60;\ - IPPB(ip, i*16+ 1, parm); *((uint64_t *)op+i*15+ 1) = (uint64_t)SRC(ip, i*16+ 1) >> 4 | (uint64_t)SRC1(ip, i*16+2) << 56;\ - IPPB(ip, i*16+ 2, parm); *((uint64_t *)op+i*15+ 2) = (uint64_t)SRC(ip, i*16+ 2) >> 8 | 
(uint64_t)SRC1(ip, i*16+3) << 52;\ - IPPB(ip, i*16+ 3, parm); *((uint64_t *)op+i*15+ 3) = (uint64_t)SRC(ip, i*16+ 3) >> 12 | (uint64_t)SRC1(ip, i*16+4) << 48;\ - IPPB(ip, i*16+ 4, parm); *((uint64_t *)op+i*15+ 4) = (uint64_t)SRC(ip, i*16+ 4) >> 16 | (uint64_t)SRC1(ip, i*16+5) << 44;\ - IPPB(ip, i*16+ 5, parm); *((uint64_t *)op+i*15+ 5) = (uint64_t)SRC(ip, i*16+ 5) >> 20 | (uint64_t)SRC1(ip, i*16+6) << 40;\ - IPPB(ip, i*16+ 6, parm); *((uint64_t *)op+i*15+ 6) = (uint64_t)SRC(ip, i*16+ 6) >> 24 | (uint64_t)SRC1(ip, i*16+7) << 36;\ - IPPB(ip, i*16+ 7, parm); *((uint64_t *)op+i*15+ 7) = (uint64_t)SRC(ip, i*16+ 7) >> 28 | (uint64_t)SRC1(ip, i*16+8) << 32;\ - IPPB(ip, i*16+ 8, parm); *((uint64_t *)op+i*15+ 8) = (uint64_t)SRC(ip, i*16+ 8) >> 32 | (uint64_t)SRC1(ip, i*16+9) << 28;\ - IPPB(ip, i*16+ 9, parm); *((uint64_t *)op+i*15+ 9) = (uint64_t)SRC(ip, i*16+ 9) >> 36 | (uint64_t)SRC1(ip, i*16+10) << 24;\ - IPPB(ip, i*16+10, parm); *((uint64_t *)op+i*15+10) = (uint64_t)SRC(ip, i*16+10) >> 40 | (uint64_t)SRC1(ip, i*16+11) << 20;\ - IPPB(ip, i*16+11, parm); *((uint64_t *)op+i*15+11) = (uint64_t)SRC(ip, i*16+11) >> 44 | (uint64_t)SRC1(ip, i*16+12) << 16;\ - IPPB(ip, i*16+12, parm); *((uint64_t *)op+i*15+12) = (uint64_t)SRC(ip, i*16+12) >> 48 | (uint64_t)SRC1(ip, i*16+13) << 12;\ - IPPB(ip, i*16+13, parm); *((uint64_t *)op+i*15+13) = (uint64_t)SRC(ip, i*16+13) >> 52 | (uint64_t)SRC1(ip, i*16+14) << 8;\ - IPPB(ip, i*16+14, parm); *((uint64_t *)op+i*15+14) = (uint64_t)SRC(ip, i*16+14) >> 56;\ - IPPB(ip, i*16+15, parm); *((uint64_t *)op+i*15+14) |= (uint64_t)SRC(ip, i*16+15) << 4;\ -} - -#define BITPACK64_60(ip, op, parm) { \ - BITBLK64_60(ip, 0, op, parm);\ - BITBLK64_60(ip, 1, op, parm); SRCI(ip); op += 60*4/sizeof(op[0]);\ -} - -#define BITBLK64_61(ip, i, op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*61+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 61;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*61+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 3 | 
(uint64_t)SRC1(ip, i*64+2) << 58;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*61+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 6 | (uint64_t)SRC1(ip, i*64+3) << 55;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*61+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 9 | (uint64_t)SRC1(ip, i*64+4) << 52;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*61+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 12 | (uint64_t)SRC1(ip, i*64+5) << 49;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*61+ 5) = (uint64_t)SRC(ip, i*64+ 5) >> 15 | (uint64_t)SRC1(ip, i*64+6) << 46;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*61+ 6) = (uint64_t)SRC(ip, i*64+ 6) >> 18 | (uint64_t)SRC1(ip, i*64+7) << 43;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*61+ 7) = (uint64_t)SRC(ip, i*64+ 7) >> 21 | (uint64_t)SRC1(ip, i*64+8) << 40;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*61+ 8) = (uint64_t)SRC(ip, i*64+ 8) >> 24 | (uint64_t)SRC1(ip, i*64+9) << 37;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*61+ 9) = (uint64_t)SRC(ip, i*64+ 9) >> 27 | (uint64_t)SRC1(ip, i*64+10) << 34;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*61+10) = (uint64_t)SRC(ip, i*64+10) >> 30 | (uint64_t)SRC1(ip, i*64+11) << 31;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*61+11) = (uint64_t)SRC(ip, i*64+11) >> 33 | (uint64_t)SRC1(ip, i*64+12) << 28;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*61+12) = (uint64_t)SRC(ip, i*64+12) >> 36 | (uint64_t)SRC1(ip, i*64+13) << 25;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*61+13) = (uint64_t)SRC(ip, i*64+13) >> 39 | (uint64_t)SRC1(ip, i*64+14) << 22;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*61+14) = (uint64_t)SRC(ip, i*64+14) >> 42 | (uint64_t)SRC1(ip, i*64+15) << 19;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*61+15) = (uint64_t)SRC(ip, i*64+15) >> 45 | (uint64_t)SRC1(ip, i*64+16) << 16;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*61+16) = (uint64_t)SRC(ip, i*64+16) >> 48 | (uint64_t)SRC1(ip, i*64+17) << 13;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*61+17) = (uint64_t)SRC(ip, i*64+17) 
>> 51 | (uint64_t)SRC1(ip, i*64+18) << 10;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*61+18) = (uint64_t)SRC(ip, i*64+18) >> 54 | (uint64_t)SRC1(ip, i*64+19) << 7;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*61+19) = (uint64_t)SRC(ip, i*64+19) >> 57 | (uint64_t)SRC1(ip, i*64+20) << 4;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*61+20) = (uint64_t)SRC(ip, i*64+20) >> 60;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*61+20) |= (uint64_t)SRC(ip, i*64+21) << 1 | (uint64_t)SRC1(ip, i*64+22) << 62;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*61+21) = (uint64_t)SRC(ip, i*64+22) >> 2 | (uint64_t)SRC1(ip, i*64+23) << 59;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*61+22) = (uint64_t)SRC(ip, i*64+23) >> 5 | (uint64_t)SRC1(ip, i*64+24) << 56;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*61+23) = (uint64_t)SRC(ip, i*64+24) >> 8 | (uint64_t)SRC1(ip, i*64+25) << 53;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*61+24) = (uint64_t)SRC(ip, i*64+25) >> 11 | (uint64_t)SRC1(ip, i*64+26) << 50;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*61+25) = (uint64_t)SRC(ip, i*64+26) >> 14 | (uint64_t)SRC1(ip, i*64+27) << 47;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*61+26) = (uint64_t)SRC(ip, i*64+27) >> 17 | (uint64_t)SRC1(ip, i*64+28) << 44;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*61+27) = (uint64_t)SRC(ip, i*64+28) >> 20 | (uint64_t)SRC1(ip, i*64+29) << 41;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*61+28) = (uint64_t)SRC(ip, i*64+29) >> 23 | (uint64_t)SRC1(ip, i*64+30) << 38;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*61+29) = (uint64_t)SRC(ip, i*64+30) >> 26 | (uint64_t)SRC1(ip, i*64+31) << 35;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*61+30) = (uint64_t)SRC(ip, i*64+31) >> 29;\ -} - -#define BITPACK64_61(ip, op, parm) { \ - BITBLK64_61(ip, 0, op, parm); SRCI(ip); op += 61*4/sizeof(op[0]);\ -} - -#define BITBLK64_62(ip, i, op, parm) { ;\ - IPPB(ip, i*32+ 0, parm); *((uint64_t *)op+i*31+ 0) = (uint64_t)SRC(ip, i*32+ 0) | (uint64_t)SRC1(ip, 
i*32+1) << 62;\ - IPPB(ip, i*32+ 1, parm); *((uint64_t *)op+i*31+ 1) = (uint64_t)SRC(ip, i*32+ 1) >> 2 | (uint64_t)SRC1(ip, i*32+2) << 60;\ - IPPB(ip, i*32+ 2, parm); *((uint64_t *)op+i*31+ 2) = (uint64_t)SRC(ip, i*32+ 2) >> 4 | (uint64_t)SRC1(ip, i*32+3) << 58;\ - IPPB(ip, i*32+ 3, parm); *((uint64_t *)op+i*31+ 3) = (uint64_t)SRC(ip, i*32+ 3) >> 6 | (uint64_t)SRC1(ip, i*32+4) << 56;\ - IPPB(ip, i*32+ 4, parm); *((uint64_t *)op+i*31+ 4) = (uint64_t)SRC(ip, i*32+ 4) >> 8 | (uint64_t)SRC1(ip, i*32+5) << 54;\ - IPPB(ip, i*32+ 5, parm); *((uint64_t *)op+i*31+ 5) = (uint64_t)SRC(ip, i*32+ 5) >> 10 | (uint64_t)SRC1(ip, i*32+6) << 52;\ - IPPB(ip, i*32+ 6, parm); *((uint64_t *)op+i*31+ 6) = (uint64_t)SRC(ip, i*32+ 6) >> 12 | (uint64_t)SRC1(ip, i*32+7) << 50;\ - IPPB(ip, i*32+ 7, parm); *((uint64_t *)op+i*31+ 7) = (uint64_t)SRC(ip, i*32+ 7) >> 14 | (uint64_t)SRC1(ip, i*32+8) << 48;\ - IPPB(ip, i*32+ 8, parm); *((uint64_t *)op+i*31+ 8) = (uint64_t)SRC(ip, i*32+ 8) >> 16 | (uint64_t)SRC1(ip, i*32+9) << 46;\ - IPPB(ip, i*32+ 9, parm); *((uint64_t *)op+i*31+ 9) = (uint64_t)SRC(ip, i*32+ 9) >> 18 | (uint64_t)SRC1(ip, i*32+10) << 44;\ - IPPB(ip, i*32+10, parm); *((uint64_t *)op+i*31+10) = (uint64_t)SRC(ip, i*32+10) >> 20 | (uint64_t)SRC1(ip, i*32+11) << 42;\ - IPPB(ip, i*32+11, parm); *((uint64_t *)op+i*31+11) = (uint64_t)SRC(ip, i*32+11) >> 22 | (uint64_t)SRC1(ip, i*32+12) << 40;\ - IPPB(ip, i*32+12, parm); *((uint64_t *)op+i*31+12) = (uint64_t)SRC(ip, i*32+12) >> 24 | (uint64_t)SRC1(ip, i*32+13) << 38;\ - IPPB(ip, i*32+13, parm); *((uint64_t *)op+i*31+13) = (uint64_t)SRC(ip, i*32+13) >> 26 | (uint64_t)SRC1(ip, i*32+14) << 36;\ - IPPB(ip, i*32+14, parm); *((uint64_t *)op+i*31+14) = (uint64_t)SRC(ip, i*32+14) >> 28 | (uint64_t)SRC1(ip, i*32+15) << 34;\ - IPPB(ip, i*32+15, parm); *((uint64_t *)op+i*31+15) = (uint64_t)SRC(ip, i*32+15) >> 30 | (uint64_t)SRC1(ip, i*32+16) << 32;\ - IPPB(ip, i*32+16, parm); *((uint64_t *)op+i*31+16) = (uint64_t)SRC(ip, i*32+16) >> 32 | 
(uint64_t)SRC1(ip, i*32+17) << 30;\ - IPPB(ip, i*32+17, parm); *((uint64_t *)op+i*31+17) = (uint64_t)SRC(ip, i*32+17) >> 34 | (uint64_t)SRC1(ip, i*32+18) << 28;\ - IPPB(ip, i*32+18, parm); *((uint64_t *)op+i*31+18) = (uint64_t)SRC(ip, i*32+18) >> 36 | (uint64_t)SRC1(ip, i*32+19) << 26;\ - IPPB(ip, i*32+19, parm); *((uint64_t *)op+i*31+19) = (uint64_t)SRC(ip, i*32+19) >> 38 | (uint64_t)SRC1(ip, i*32+20) << 24;\ - IPPB(ip, i*32+20, parm); *((uint64_t *)op+i*31+20) = (uint64_t)SRC(ip, i*32+20) >> 40 | (uint64_t)SRC1(ip, i*32+21) << 22;\ - IPPB(ip, i*32+21, parm); *((uint64_t *)op+i*31+21) = (uint64_t)SRC(ip, i*32+21) >> 42 | (uint64_t)SRC1(ip, i*32+22) << 20;\ - IPPB(ip, i*32+22, parm); *((uint64_t *)op+i*31+22) = (uint64_t)SRC(ip, i*32+22) >> 44 | (uint64_t)SRC1(ip, i*32+23) << 18;\ - IPPB(ip, i*32+23, parm); *((uint64_t *)op+i*31+23) = (uint64_t)SRC(ip, i*32+23) >> 46 | (uint64_t)SRC1(ip, i*32+24) << 16;\ - IPPB(ip, i*32+24, parm); *((uint64_t *)op+i*31+24) = (uint64_t)SRC(ip, i*32+24) >> 48 | (uint64_t)SRC1(ip, i*32+25) << 14;\ - IPPB(ip, i*32+25, parm); *((uint64_t *)op+i*31+25) = (uint64_t)SRC(ip, i*32+25) >> 50 | (uint64_t)SRC1(ip, i*32+26) << 12;\ - IPPB(ip, i*32+26, parm); *((uint64_t *)op+i*31+26) = (uint64_t)SRC(ip, i*32+26) >> 52 | (uint64_t)SRC1(ip, i*32+27) << 10;\ - IPPB(ip, i*32+27, parm); *((uint64_t *)op+i*31+27) = (uint64_t)SRC(ip, i*32+27) >> 54 | (uint64_t)SRC1(ip, i*32+28) << 8;\ - IPPB(ip, i*32+28, parm); *((uint64_t *)op+i*31+28) = (uint64_t)SRC(ip, i*32+28) >> 56 | (uint64_t)SRC1(ip, i*32+29) << 6;\ - IPPB(ip, i*32+29, parm); *((uint64_t *)op+i*31+29) = (uint64_t)SRC(ip, i*32+29) >> 58 | (uint64_t)SRC1(ip, i*32+30) << 4;\ - IPPB(ip, i*32+30, parm); *((uint64_t *)op+i*31+30) = (uint64_t)SRC(ip, i*32+30) >> 60;\ - IPPB(ip, i*32+31, parm); *((uint64_t *)op+i*31+30) |= (uint64_t)SRC(ip, i*32+31) << 2;\ -} - -#define BITPACK64_62(ip, op, parm) { \ - BITBLK64_62(ip, 0, op, parm); SRCI(ip); op += 62*4/sizeof(op[0]);\ -} - -#define BITBLK64_63(ip, i, 
op, parm) { ;\ - IPPB(ip, i*64+ 0, parm); *((uint64_t *)op+i*63+ 0) = (uint64_t)SRC(ip, i*64+ 0) | (uint64_t)SRC1(ip, i*64+1) << 63;\ - IPPB(ip, i*64+ 1, parm); *((uint64_t *)op+i*63+ 1) = (uint64_t)SRC(ip, i*64+ 1) >> 1 | (uint64_t)SRC1(ip, i*64+2) << 62;\ - IPPB(ip, i*64+ 2, parm); *((uint64_t *)op+i*63+ 2) = (uint64_t)SRC(ip, i*64+ 2) >> 2 | (uint64_t)SRC1(ip, i*64+3) << 61;\ - IPPB(ip, i*64+ 3, parm); *((uint64_t *)op+i*63+ 3) = (uint64_t)SRC(ip, i*64+ 3) >> 3 | (uint64_t)SRC1(ip, i*64+4) << 60;\ - IPPB(ip, i*64+ 4, parm); *((uint64_t *)op+i*63+ 4) = (uint64_t)SRC(ip, i*64+ 4) >> 4 | (uint64_t)SRC1(ip, i*64+5) << 59;\ - IPPB(ip, i*64+ 5, parm); *((uint64_t *)op+i*63+ 5) = (uint64_t)SRC(ip, i*64+ 5) >> 5 | (uint64_t)SRC1(ip, i*64+6) << 58;\ - IPPB(ip, i*64+ 6, parm); *((uint64_t *)op+i*63+ 6) = (uint64_t)SRC(ip, i*64+ 6) >> 6 | (uint64_t)SRC1(ip, i*64+7) << 57;\ - IPPB(ip, i*64+ 7, parm); *((uint64_t *)op+i*63+ 7) = (uint64_t)SRC(ip, i*64+ 7) >> 7 | (uint64_t)SRC1(ip, i*64+8) << 56;\ - IPPB(ip, i*64+ 8, parm); *((uint64_t *)op+i*63+ 8) = (uint64_t)SRC(ip, i*64+ 8) >> 8 | (uint64_t)SRC1(ip, i*64+9) << 55;\ - IPPB(ip, i*64+ 9, parm); *((uint64_t *)op+i*63+ 9) = (uint64_t)SRC(ip, i*64+ 9) >> 9 | (uint64_t)SRC1(ip, i*64+10) << 54;\ - IPPB(ip, i*64+10, parm); *((uint64_t *)op+i*63+10) = (uint64_t)SRC(ip, i*64+10) >> 10 | (uint64_t)SRC1(ip, i*64+11) << 53;\ - IPPB(ip, i*64+11, parm); *((uint64_t *)op+i*63+11) = (uint64_t)SRC(ip, i*64+11) >> 11 | (uint64_t)SRC1(ip, i*64+12) << 52;\ - IPPB(ip, i*64+12, parm); *((uint64_t *)op+i*63+12) = (uint64_t)SRC(ip, i*64+12) >> 12 | (uint64_t)SRC1(ip, i*64+13) << 51;\ - IPPB(ip, i*64+13, parm); *((uint64_t *)op+i*63+13) = (uint64_t)SRC(ip, i*64+13) >> 13 | (uint64_t)SRC1(ip, i*64+14) << 50;\ - IPPB(ip, i*64+14, parm); *((uint64_t *)op+i*63+14) = (uint64_t)SRC(ip, i*64+14) >> 14 | (uint64_t)SRC1(ip, i*64+15) << 49;\ - IPPB(ip, i*64+15, parm); *((uint64_t *)op+i*63+15) = (uint64_t)SRC(ip, i*64+15) >> 15 | (uint64_t)SRC1(ip, i*64+16) 
<< 48;\ - IPPB(ip, i*64+16, parm); *((uint64_t *)op+i*63+16) = (uint64_t)SRC(ip, i*64+16) >> 16 | (uint64_t)SRC1(ip, i*64+17) << 47;\ - IPPB(ip, i*64+17, parm); *((uint64_t *)op+i*63+17) = (uint64_t)SRC(ip, i*64+17) >> 17 | (uint64_t)SRC1(ip, i*64+18) << 46;\ - IPPB(ip, i*64+18, parm); *((uint64_t *)op+i*63+18) = (uint64_t)SRC(ip, i*64+18) >> 18 | (uint64_t)SRC1(ip, i*64+19) << 45;\ - IPPB(ip, i*64+19, parm); *((uint64_t *)op+i*63+19) = (uint64_t)SRC(ip, i*64+19) >> 19 | (uint64_t)SRC1(ip, i*64+20) << 44;\ - IPPB(ip, i*64+20, parm); *((uint64_t *)op+i*63+20) = (uint64_t)SRC(ip, i*64+20) >> 20 | (uint64_t)SRC1(ip, i*64+21) << 43;\ - IPPB(ip, i*64+21, parm); *((uint64_t *)op+i*63+21) = (uint64_t)SRC(ip, i*64+21) >> 21 | (uint64_t)SRC1(ip, i*64+22) << 42;\ - IPPB(ip, i*64+22, parm); *((uint64_t *)op+i*63+22) = (uint64_t)SRC(ip, i*64+22) >> 22 | (uint64_t)SRC1(ip, i*64+23) << 41;\ - IPPB(ip, i*64+23, parm); *((uint64_t *)op+i*63+23) = (uint64_t)SRC(ip, i*64+23) >> 23 | (uint64_t)SRC1(ip, i*64+24) << 40;\ - IPPB(ip, i*64+24, parm); *((uint64_t *)op+i*63+24) = (uint64_t)SRC(ip, i*64+24) >> 24 | (uint64_t)SRC1(ip, i*64+25) << 39;\ - IPPB(ip, i*64+25, parm); *((uint64_t *)op+i*63+25) = (uint64_t)SRC(ip, i*64+25) >> 25 | (uint64_t)SRC1(ip, i*64+26) << 38;\ - IPPB(ip, i*64+26, parm); *((uint64_t *)op+i*63+26) = (uint64_t)SRC(ip, i*64+26) >> 26 | (uint64_t)SRC1(ip, i*64+27) << 37;\ - IPPB(ip, i*64+27, parm); *((uint64_t *)op+i*63+27) = (uint64_t)SRC(ip, i*64+27) >> 27 | (uint64_t)SRC1(ip, i*64+28) << 36;\ - IPPB(ip, i*64+28, parm); *((uint64_t *)op+i*63+28) = (uint64_t)SRC(ip, i*64+28) >> 28 | (uint64_t)SRC1(ip, i*64+29) << 35;\ - IPPB(ip, i*64+29, parm); *((uint64_t *)op+i*63+29) = (uint64_t)SRC(ip, i*64+29) >> 29 | (uint64_t)SRC1(ip, i*64+30) << 34;\ - IPPB(ip, i*64+30, parm); *((uint64_t *)op+i*63+30) = (uint64_t)SRC(ip, i*64+30) >> 30 | (uint64_t)SRC1(ip, i*64+31) << 33;\ - IPPB(ip, i*64+31, parm); *((uint64_t *)op+i*63+31) = (uint64_t)SRC(ip, i*64+31) >> 31;\ -} - 
-#define BITPACK64_63(ip, op, parm) { \ - BITBLK64_63(ip, 0, op, parm); SRCI(ip); op += 63*4/sizeof(op[0]);\ -} - -#define BITBLK64_64(ip, i, op, parm) { ;\ - IPPB(ip, i*1+ 0, parm); *((uint64_t *)op+i*1+ 0) = (uint64_t)SRC(ip, i*1+ 0) ;\ -} - -#define BITPACK64_64(ip, op, parm) { \ - BITBLK64_64(ip, 0, op, parm);\ - BITBLK64_64(ip, 1, op, parm);\ - BITBLK64_64(ip, 2, op, parm);\ - BITBLK64_64(ip, 3, op, parm);\ - BITBLK64_64(ip, 4, op, parm);\ - BITBLK64_64(ip, 5, op, parm);\ - BITBLK64_64(ip, 6, op, parm);\ - BITBLK64_64(ip, 7, op, parm);\ - BITBLK64_64(ip, 8, op, parm);\ - BITBLK64_64(ip, 9, op, parm);\ - BITBLK64_64(ip, 10, op, parm);\ - BITBLK64_64(ip, 11, op, parm);\ - BITBLK64_64(ip, 12, op, parm);\ - BITBLK64_64(ip, 13, op, parm);\ - BITBLK64_64(ip, 14, op, parm);\ - BITBLK64_64(ip, 15, op, parm);\ - BITBLK64_64(ip, 16, op, parm);\ - BITBLK64_64(ip, 17, op, parm);\ - BITBLK64_64(ip, 18, op, parm);\ - BITBLK64_64(ip, 19, op, parm);\ - BITBLK64_64(ip, 20, op, parm);\ - BITBLK64_64(ip, 21, op, parm);\ - BITBLK64_64(ip, 22, op, parm);\ - BITBLK64_64(ip, 23, op, parm);\ - BITBLK64_64(ip, 24, op, parm);\ - BITBLK64_64(ip, 25, op, parm);\ - BITBLK64_64(ip, 26, op, parm);\ - BITBLK64_64(ip, 27, op, parm);\ - BITBLK64_64(ip, 28, op, parm);\ - BITBLK64_64(ip, 29, op, parm);\ - BITBLK64_64(ip, 30, op, parm);\ - BITBLK64_64(ip, 31, op, parm); SRCI(ip); op += 64*4/sizeof(op[0]);\ -} - diff --git a/bitpackv.c b/bitpackv.c deleted file mode 100644 index 438df97..0000000 --- a/bitpackv.c +++ /dev/null @@ -1,128 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" SIMD bit packing -#include -#include "bitpack.h" -#include "bitutil.h" - -#define OPPE(__op) -#define IPPE(__op) - -#define PAD8(__x) (((__x)+8-1)/8) - -#define VSTI(ip, i, iv, parm) -#define IPP(ip, i, iv) _mm_loadu_si128(ip++) -#include "bitpack128v_.h" - -unsigned char *bitpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(128*b); BITPACK128V32(in, b, out, 0); return pout; } -#undef VSTI -#undef IPP - -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = DELTA128x32(v,__sv); __sv = v -#define IPP(ip, i, __iv) __iv -#include "bitpack128v_.h" - -unsigned char *bitdpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b); - __m128i v,sv = _mm_set1_epi32(start); - BITPACK128V32(in, b, out, sv); - return pout; -} -#undef VSTI - -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = _mm_sub_epi32(DELTA128x32(v,__sv),cv); __sv = v - -unsigned char 
*bitd1pack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b); - __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); - BITPACK128V32(in, b, out, sv); return pout; -} -#undef VSTI -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm_loadu_si128(__ip++); __iv = DELTA128x32(v,__sv); __sv = v; __iv = ZIGZAG128x32(__iv) - -unsigned char *bitzpack128v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(128*b); - __m128i v, sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(1); - BITPACK128V32(in, b, out, sv); - return pout; -} -#undef VSTI -#undef IPP - -#ifdef __AVX2__ -#include -#include -#include - -#define OPPE(__op) -#define IPPE(__op) - -#define PAD8(__x) (((__x)+8-1)/8) -#define OPPE(__op) -#define IPPE(__op) - -#define VSTI(ip, i, iv, parm) -#define IPP(ip, i, iv) _mm256_loadu_si256(ip++) -#include "bitpack256v_.h" -//#include "bitpack.h" -//#include "bitutil.h" - -unsigned char *bitpack256v32(unsigned *__restrict in, unsigned n, unsigned char *__restrict out, unsigned b) { unsigned char *pout = out+PAD8(256*b); BITPACK256V32(in, b, out, 0); return pout; } -#undef VSTI -#undef IPP - -//------------------------------------------------------------------------------------------------------------------------------ -#if 0 -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); DELTA256x32(v,__sv, __iv) //__sv = v -#define IPP(ip, i, __iv) __iv -#include "bitpack256v_.h" - -unsigned char *bitdpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v; //,sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(); - __m128i sv = _mm_set1_epi32(start); - BITPACK256V32(in, b, 
out, sv); - return pout; -} -#undef VSTI - -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = _mm256_sub_epi32(DELTA256x32(v,__sv),cv); __sv = v - -unsigned char *bitd1pack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1); - //BITPACK256V32(in, b, out, sv); return pout; -} -#undef VSTI -//------------------------------------------------------------------------------------------------------------------------------ -#define VSTI(__ip, __i, __iv, __sv) v = _mm256_loadu_si256(__ip++); __iv = DELTA256x32(v,__sv); __sv = v; __iv = ZIGZAG256x32(__iv) - -unsigned char *bitzpack256v32(unsigned *__restrict in, unsigned char *__restrict out, unsigned start, unsigned b) { unsigned char *pout = out+PAD8(256*b); - __m256i v, sv = _mm256_set1_epi32(start), cv = _mm256_set1_epi32(1); - //BITPACK256V32(in, b, out, sv); - return pout; -} -#endif -#undef VSTI -#endif - diff --git a/bitunpack128v_.h b/bitunpack128v_.h deleted file mode 100644 index c0ca842..0000000 --- a/bitunpack128v_.h +++ /dev/null @@ -1,2041 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// TurboPFor: Integer Compression SIMD bit unpacking -#define BITUNPACK128V32_0(ip, op, parm) {\ - BITUNBLK128V32_0(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_1(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+17,ov,parm); \ 
- ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 29),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 30),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_1(ip, op, parm) {\ - BITUNBLK128V32_1(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_2(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 9,ov,parm); \ - 
ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_2(ip, op, parm) {\ - BITUNBLK128V32_2(ip, 0, op, parm);\ - BITUNBLK128V32_2(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_3(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+15,ov,parm); \ - ov = 
_mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_3(ip, op, parm) {\ - BITUNBLK128V32_3(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_4(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*8+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*8+ 6,ov,parm); \ - ov = 
_mm_srli_epi32(iv, 28); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK128V32_4(ip, op, parm) {\ - BITUNBLK128V32_4(ip, 0, op, parm);\ - BITUNBLK128V32_4(ip, 1, op, parm);\ - BITUNBLK128V32_4(ip, 2, op, parm);\ - BITUNBLK128V32_4(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_5(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_5(ip, op, parm) {\ - BITUNBLK128V32_5(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_6(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = 
_mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*16+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_6(ip, op, parm) {\ - BITUNBLK128V32_6(ip, 0, op, parm);\ - BITUNBLK128V32_6(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_7(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+11,ov,parm); \ - ov = 
_mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_7(ip, op, parm) {\ - BITUNBLK128V32_7(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_8(ip, i, op, parm) { __m128i 
ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*4+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*4+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); VSTO(op,i*4+ 3,ov,parm); ;\ -} - -#define BITUNPACK128V32_8(ip, op, parm) {\ - BITUNBLK128V32_8(ip, 0, op, parm);\ - BITUNBLK128V32_8(ip, 1, op, parm);\ - BITUNBLK128V32_8(ip, 2, op, parm);\ - BITUNBLK128V32_8(ip, 3, op, parm);\ - BITUNBLK128V32_8(ip, 4, op, parm);\ - BITUNBLK128V32_8(ip, 5, op, parm);\ - BITUNBLK128V32_8(ip, 6, op, parm);\ - BITUNBLK128V32_8(ip, 7, op, parm);\ -} - -#define BITUNBLK128V32_9(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ 
- ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_9(ip, op, parm) {\ - BITUNBLK128V32_9(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_10(ip, i, op, 
parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_10(ip, op, parm) {\ - BITUNBLK128V32_10(ip, 0, op, parm);\ - BITUNBLK128V32_10(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_11(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); 
VSTO(op,i*32+20,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_11(ip, op, parm) {\ - BITUNBLK128V32_11(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_12(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 1,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 3,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 6,ov,parm); \ - ov = 
_mm_srli_epi32(iv, 20); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK128V32_12(ip, op, parm) {\ - BITUNBLK128V32_12(ip, 0, op, parm);\ - BITUNBLK128V32_12(ip, 1, op, parm);\ - BITUNBLK128V32_12(ip, 2, op, parm);\ - BITUNBLK128V32_12(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_13(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); 
VSTO(op,i*32+14,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 19); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_13(ip, op, parm) {\ - BITUNBLK128V32_13(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_14(ip, i, op, parm) { __m128i ov,iv = 
_mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 18); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_14(ip, op, parm) {\ - BITUNBLK128V32_14(ip, 0, op, parm);\ - BITUNBLK128V32_14(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_15(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv 
,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); 
VSTO(op,i*32+17,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm_srli_epi32(iv, 17); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_15(ip, op, parm) {\ - BITUNBLK128V32_15(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_16(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*2+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 16); VSTO(op,i*2+ 1,ov,parm); ;\ -} - -#define BITUNPACK128V32_16(ip, op, parm) {\ - BITUNBLK128V32_16(ip, 0, op, parm);\ - 
BITUNBLK128V32_16(ip, 1, op, parm);\ - BITUNBLK128V32_16(ip, 2, op, parm);\ - BITUNBLK128V32_16(ip, 3, op, parm);\ - BITUNBLK128V32_16(ip, 4, op, parm);\ - BITUNBLK128V32_16(ip, 5, op, parm);\ - BITUNBLK128V32_16(ip, 6, op, parm);\ - BITUNBLK128V32_16(ip, 7, op, parm);\ - BITUNBLK128V32_16(ip, 8, op, parm);\ - BITUNBLK128V32_16(ip, 9, op, parm);\ - BITUNBLK128V32_16(ip, 10, op, parm);\ - BITUNBLK128V32_16(ip, 11, op, parm);\ - BITUNBLK128V32_16(ip, 12, op, parm);\ - BITUNBLK128V32_16(ip, 13, op, parm);\ - BITUNBLK128V32_16(ip, 14, op, parm);\ - BITUNBLK128V32_16(ip, 15, op, parm);\ -} - -#define BITUNBLK128V32_17(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 
11),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_17(ip, op, parm) {\ - BITUNBLK128V32_17(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_18(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), 
mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_18(ip, op, parm) {\ - BITUNBLK128V32_18(ip, 0, op, parm);\ - BITUNBLK128V32_18(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_19(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = 
_mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_19(ip, op, parm) {\ - BITUNBLK128V32_19(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_20(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK128V32_20(ip, op, parm) {\ - BITUNBLK128V32_20(ip, 0, op, parm);\ - 
BITUNBLK128V32_20(ip, 1, op, parm);\ - BITUNBLK128V32_20(ip, 2, op, parm);\ - BITUNBLK128V32_20(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_21(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_21(ip, op, parm) {\ - BITUNBLK128V32_21(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_22(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); 
VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_22(ip, op, parm) {\ - BITUNBLK128V32_22(ip, 0, op, parm);\ - BITUNBLK128V32_22(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_23(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ 
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_23(ip, op, parm) {\ - BITUNBLK128V32_23(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_24(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); VSTO(op,i*4+ 3,ov,parm); ;\ -} - -#define BITUNPACK128V32_24(ip, op, parm) {\ - BITUNBLK128V32_24(ip, 0, op, parm);\ - BITUNBLK128V32_24(ip, 1, op, parm);\ - BITUNBLK128V32_24(ip, 2, op, parm);\ - 
BITUNBLK128V32_24(ip, 3, op, parm);\ - BITUNBLK128V32_24(ip, 4, op, parm);\ - BITUNBLK128V32_24(ip, 5, op, parm);\ - BITUNBLK128V32_24(ip, 6, op, parm);\ - BITUNBLK128V32_24(ip, 7, op, parm);\ -} - -#define BITUNBLK128V32_25(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); 
iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_25(ip, op, parm) {\ - BITUNBLK128V32_25(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_26(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_26(ip, op, parm) {\ - BITUNBLK128V32_26(ip, 0, op, parm);\ - BITUNBLK128V32_26(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_27(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_27(ip, op, parm) {\ - BITUNBLK128V32_27(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_28(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK128V32_28(ip, op, parm) {\ - BITUNBLK128V32_28(ip, 0, op, parm);\ - BITUNBLK128V32_28(ip, 1, op, parm);\ - BITUNBLK128V32_28(ip, 2, op, parm);\ - BITUNBLK128V32_28(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_29(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_29(ip, op, parm) {\ - BITUNBLK128V32_29(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_30(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov 
= _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK128V32_30(ip, op, parm) {\ - BITUNBLK128V32_30(ip, 0, op, parm);\ - BITUNBLK128V32_30(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_31(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); 
VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK128V32_31(ip, op, parm) {\ - BITUNBLK128V32_31(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_32(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_and_si128( iv ,mv); VSTO(op,i*1+ 0,ov,parm); ;\ -} - -#define BITUNPACK128V32_32(ip, op, parm) {\ - BITUNBLK128V32_32(ip, 0, op, parm);\ - BITUNBLK128V32_32(ip, 1, op, parm);\ - BITUNBLK128V32_32(ip, 2, op, parm);\ - BITUNBLK128V32_32(ip, 3, op, parm);\ - BITUNBLK128V32_32(ip, 4, op, parm);\ - BITUNBLK128V32_32(ip, 5, op, parm);\ - BITUNBLK128V32_32(ip, 6, op, parm);\ - BITUNBLK128V32_32(ip, 7, op, parm);\ - BITUNBLK128V32_32(ip, 8, op, parm);\ - BITUNBLK128V32_32(ip, 9, op, parm);\ - BITUNBLK128V32_32(ip, 10, op, parm);\ - BITUNBLK128V32_32(ip, 11, op, parm);\ - BITUNBLK128V32_32(ip, 12, op, parm);\ - BITUNBLK128V32_32(ip, 13, op, parm);\ - BITUNBLK128V32_32(ip, 14, op, parm);\ - BITUNBLK128V32_32(ip, 15, op, parm);\ - BITUNBLK128V32_32(ip, 16, op, parm);\ - BITUNBLK128V32_32(ip, 17, op, parm);\ - BITUNBLK128V32_32(ip, 18, op, parm);\ - BITUNBLK128V32_32(ip, 19, op, parm);\ - BITUNBLK128V32_32(ip, 20, op, parm);\ - BITUNBLK128V32_32(ip, 21, op, parm);\ - BITUNBLK128V32_32(ip, 22, op, parm);\ - BITUNBLK128V32_32(ip, 23, op, parm);\ - BITUNBLK128V32_32(ip, 24, op, parm);\ - BITUNBLK128V32_32(ip, 25, op, parm);\ - BITUNBLK128V32_32(ip, 26, op, parm);\ - BITUNBLK128V32_32(ip, 27, op, parm);\ - BITUNBLK128V32_32(ip, 28, op, parm);\ - BITUNBLK128V32_32(ip, 29, op, parm);\ - BITUNBLK128V32_32(ip, 30, op, parm);\ - BITUNBLK128V32_32(ip, 31, op, parm);\ -} - -#define BITUNBLK128V32_33(ip, i, op, parm) { 
__m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+11,ov,parm);\ - 
ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_33(ip, op, parm) {\ - BITUNBLK128V32_33(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_34(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_34(ip, op, parm) {\ - BITUNBLK128V32_34(ip, 0, op, parm);\ - BITUNBLK128V32_34(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_35(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 
30), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_35(ip, op, parm) {\ - BITUNBLK128V32_35(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_36(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); 
VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK128V32_36(ip, op, parm) {\ - BITUNBLK128V32_36(ip, 0, op, parm);\ - BITUNBLK128V32_36(ip, 1, op, parm);\ - BITUNBLK128V32_36(ip, 2, op, parm);\ - BITUNBLK128V32_36(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_37(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 
20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_37(ip, op, parm) {\ - BITUNBLK128V32_37(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_38(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); 
VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_38(ip, op, parm) {\ - BITUNBLK128V32_38(ip, 0, op, parm);\ - BITUNBLK128V32_38(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_39(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); 
VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_39(ip, op, parm) {\ - BITUNBLK128V32_39(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_40(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 3,ov,parm);;\ -} - -#define BITUNPACK128V32_40(ip, op, parm) {\ - BITUNBLK128V32_40(ip, 0, op, parm);\ - BITUNBLK128V32_40(ip, 1, op, parm);\ - BITUNBLK128V32_40(ip, 2, op, parm);\ - BITUNBLK128V32_40(ip, 3, op, parm);\ - BITUNBLK128V32_40(ip, 4, op, parm);\ - BITUNBLK128V32_40(ip, 5, op, parm);\ - BITUNBLK128V32_40(ip, 6, op, parm);\ - BITUNBLK128V32_40(ip, 7, op, 
parm);\ -} - -#define BITUNBLK128V32_41(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); 
VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_41(ip, op, parm) {\ - BITUNBLK128V32_41(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_42(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 
2,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_42(ip, op, parm) {\ - BITUNBLK128V32_42(ip, 0, op, parm);\ - BITUNBLK128V32_42(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_43(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_43(ip, op, parm) {\ - BITUNBLK128V32_43(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_44(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK128V32_44(ip, op, parm) {\ - BITUNBLK128V32_44(ip, 0, op, parm);\ - BITUNBLK128V32_44(ip, 1, op, parm);\ - BITUNBLK128V32_44(ip, 2, op, parm);\ - BITUNBLK128V32_44(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_45(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); 
VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_45(ip, op, parm) {\ - BITUNBLK128V32_45(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_46(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 
2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_46(ip, op, parm) {\ - BITUNBLK128V32_46(ip, 0, op, parm);\ - BITUNBLK128V32_46(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_47(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_47(ip, op, parm) {\ - BITUNBLK128V32_47(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_48(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*2+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*2+ 1,ov,parm);;\ -} - -#define BITUNPACK128V32_48(ip, op, parm) {\ - BITUNBLK128V32_48(ip, 0, op, parm);\ - BITUNBLK128V32_48(ip, 1, op, parm);\ - BITUNBLK128V32_48(ip, 2, op, parm);\ - BITUNBLK128V32_48(ip, 3, op, parm);\ - BITUNBLK128V32_48(ip, 4, op, parm);\ - BITUNBLK128V32_48(ip, 5, op, parm);\ - BITUNBLK128V32_48(ip, 6, op, parm);\ - BITUNBLK128V32_48(ip, 7, op, parm);\ - BITUNBLK128V32_48(ip, 8, op, parm);\ - BITUNBLK128V32_48(ip, 9, op, parm);\ - BITUNBLK128V32_48(ip, 10, op, parm);\ - BITUNBLK128V32_48(ip, 11, op, parm);\ - BITUNBLK128V32_48(ip, 12, op, parm);\ - 
BITUNBLK128V32_48(ip, 13, op, parm);\ - BITUNBLK128V32_48(ip, 14, op, parm);\ - BITUNBLK128V32_48(ip, 15, op, parm);\ -} - -#define BITUNBLK128V32_49(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); 
VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_49(ip, op, parm) {\ - BITUNBLK128V32_49(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_50(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 
4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_50(ip, op, parm) {\ - BITUNBLK128V32_50(ip, 0, op, parm);\ - BITUNBLK128V32_50(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_51(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_51(ip, op, parm) {\ - BITUNBLK128V32_51(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_52(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK128V32_52(ip, op, parm) {\ - BITUNBLK128V32_52(ip, 0, op, parm);\ - BITUNBLK128V32_52(ip, 1, op, parm);\ - BITUNBLK128V32_52(ip, 2, op, parm);\ - BITUNBLK128V32_52(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_53(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ - 
ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = 
_mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_53(ip, op, parm) {\ - BITUNBLK128V32_53(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_54(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_54(ip, op, parm) {\ - BITUNBLK128V32_54(ip, 0, op, parm);\ - BITUNBLK128V32_54(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_55(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov 
= _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); 
VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_55(ip, op, parm) {\ - BITUNBLK128V32_55(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_56(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 3,ov,parm);;\ -} - -#define BITUNPACK128V32_56(ip, op, parm) {\ - BITUNBLK128V32_56(ip, 0, op, parm);\ - BITUNBLK128V32_56(ip, 1, op, parm);\ - BITUNBLK128V32_56(ip, 2, op, 
parm);\ - BITUNBLK128V32_56(ip, 3, op, parm);\ - BITUNBLK128V32_56(ip, 4, op, parm);\ - BITUNBLK128V32_56(ip, 5, op, parm);\ - BITUNBLK128V32_56(ip, 6, op, parm);\ - BITUNBLK128V32_56(ip, 7, op, parm);\ -} - -#define BITUNBLK128V32_57(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); 
ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_57(ip, op, parm) {\ - BITUNBLK128V32_57(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_58(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); 
VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_58(ip, op, parm) {\ - BITUNBLK128V32_58(ip, 0, op, parm);\ - BITUNBLK128V32_58(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_59(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); 
VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_59(ip, op, parm) {\ - BITUNBLK128V32_59(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_60(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i 
*)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK128V32_60(ip, op, parm) {\ - BITUNBLK128V32_60(ip, 0, op, parm);\ - BITUNBLK128V32_60(ip, 1, op, parm);\ - BITUNBLK128V32_60(ip, 2, op, parm);\ - BITUNBLK128V32_60(ip, 3, op, parm);\ -} - -#define BITUNBLK128V32_61(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); 
VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_61(ip, op, parm) {\ - BITUNBLK128V32_61(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_62(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 
5,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK128V32_62(ip, op, parm) {\ - BITUNBLK128V32_62(ip, 0, op, parm);\ - BITUNBLK128V32_62(ip, 1, op, parm);\ -} - -#define BITUNBLK128V32_63(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), 
mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = 
_mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm_srli_epi32(iv, 1); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 31), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK128V32_63(ip, op, parm) {\ - BITUNBLK128V32_63(ip, 0, op, parm);\ -} - -#define BITUNBLK128V32_64(ip, i, op, parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ - ov = _mm_srli_epi32(iv, 0); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 32), mv)); VSTO(op,i*1+ 0,ov,parm);;\ -} - -#define BITUNPACK128V32_64(ip, op, parm) {\ - BITUNBLK128V32_64(ip, 0, op, parm);\ - BITUNBLK128V32_64(ip, 1, op, parm);\ - BITUNBLK128V32_64(ip, 2, op, parm);\ - BITUNBLK128V32_64(ip, 3, op, parm);\ - BITUNBLK128V32_64(ip, 4, op, parm);\ - BITUNBLK128V32_64(ip, 5, op, parm);\ - BITUNBLK128V32_64(ip, 6, op, parm);\ - BITUNBLK128V32_64(ip, 7, op, parm);\ - BITUNBLK128V32_64(ip, 8, op, parm);\ - BITUNBLK128V32_64(ip, 9, op, parm);\ - BITUNBLK128V32_64(ip, 10, op, parm);\ - BITUNBLK128V32_64(ip, 11, op, parm);\ - BITUNBLK128V32_64(ip, 
12, op, parm);\ - BITUNBLK128V32_64(ip, 13, op, parm);\ - BITUNBLK128V32_64(ip, 14, op, parm);\ - BITUNBLK128V32_64(ip, 15, op, parm);\ - BITUNBLK128V32_64(ip, 16, op, parm);\ - BITUNBLK128V32_64(ip, 17, op, parm);\ - BITUNBLK128V32_64(ip, 18, op, parm);\ - BITUNBLK128V32_64(ip, 19, op, parm);\ - BITUNBLK128V32_64(ip, 20, op, parm);\ - BITUNBLK128V32_64(ip, 21, op, parm);\ - BITUNBLK128V32_64(ip, 22, op, parm);\ - BITUNBLK128V32_64(ip, 23, op, parm);\ - BITUNBLK128V32_64(ip, 24, op, parm);\ - BITUNBLK128V32_64(ip, 25, op, parm);\ - BITUNBLK128V32_64(ip, 26, op, parm);\ - BITUNBLK128V32_64(ip, 27, op, parm);\ - BITUNBLK128V32_64(ip, 28, op, parm);\ - BITUNBLK128V32_64(ip, 29, op, parm);\ - BITUNBLK128V32_64(ip, 30, op, parm);\ - BITUNBLK128V32_64(ip, 31, op, parm);\ -} - -#define BITUNPACK128V32(__ip, __nbits, __op, _parm_) { __m128i mv,*_ov=(__m128i *)__op,*_iv=(__m128i *)__ip; \ - switch(__nbits&0x3f) {\ - case 0: BITUNPACK0(_parm_); BITUNPACK128V32_0( _iv, _ov, _parm_); break;\ - case 1: mv = _mm_set1_epi32((1u<< 1)-1); BITUNPACK128V32_1( _iv, _ov, _parm_); break;\ - case 2: mv = _mm_set1_epi32((1u<< 2)-1); BITUNPACK128V32_2( _iv, _ov, _parm_); break;\ - case 3: mv = _mm_set1_epi32((1u<< 3)-1); BITUNPACK128V32_3( _iv, _ov, _parm_); break;\ - case 4: mv = _mm_set1_epi32((1u<< 4)-1); BITUNPACK128V32_4( _iv, _ov, _parm_); break;\ - case 5: mv = _mm_set1_epi32((1u<< 5)-1); BITUNPACK128V32_5( _iv, _ov, _parm_); break;\ - case 6: mv = _mm_set1_epi32((1u<< 6)-1); BITUNPACK128V32_6( _iv, _ov, _parm_); break;\ - case 7: mv = _mm_set1_epi32((1u<< 7)-1); BITUNPACK128V32_7( _iv, _ov, _parm_); break;\ - case 8: mv = _mm_set1_epi32((1u<< 8)-1); BITUNPACK128V32_8( _iv, _ov, _parm_); break;\ - case 9: mv = _mm_set1_epi32((1u<< 9)-1); BITUNPACK128V32_9( _iv, _ov, _parm_); break;\ - case 10: mv = _mm_set1_epi32((1u<<10)-1); BITUNPACK128V32_10(_iv, _ov, _parm_); break;\ - case 11: mv = _mm_set1_epi32((1u<<11)-1); BITUNPACK128V32_11(_iv, _ov, _parm_); break;\ - case 12: mv = 
_mm_set1_epi32((1u<<12)-1); BITUNPACK128V32_12(_iv, _ov, _parm_); break;\ - case 13: mv = _mm_set1_epi32((1u<<13)-1); BITUNPACK128V32_13(_iv, _ov, _parm_); break;\ - case 14: mv = _mm_set1_epi32((1u<<14)-1); BITUNPACK128V32_14(_iv, _ov, _parm_); break;\ - case 15: mv = _mm_set1_epi32((1u<<15)-1); BITUNPACK128V32_15(_iv, _ov, _parm_); break;\ - case 16: mv = _mm_set1_epi32((1u<<16)-1); BITUNPACK128V32_16(_iv, _ov, _parm_); break;\ - case 17: mv = _mm_set1_epi32((1u<<17)-1); BITUNPACK128V32_17(_iv, _ov, _parm_); break;\ - case 18: mv = _mm_set1_epi32((1u<<18)-1); BITUNPACK128V32_18(_iv, _ov, _parm_); break;\ - case 19: mv = _mm_set1_epi32((1u<<19)-1); BITUNPACK128V32_19(_iv, _ov, _parm_); break;\ - case 20: mv = _mm_set1_epi32((1u<<20)-1); BITUNPACK128V32_20(_iv, _ov, _parm_); break;\ - case 21: mv = _mm_set1_epi32((1u<<21)-1); BITUNPACK128V32_21(_iv, _ov, _parm_); break;\ - case 22: mv = _mm_set1_epi32((1u<<22)-1); BITUNPACK128V32_22(_iv, _ov, _parm_); break;\ - case 23: mv = _mm_set1_epi32((1u<<23)-1); BITUNPACK128V32_23(_iv, _ov, _parm_); break;\ - case 24: mv = _mm_set1_epi32((1u<<24)-1); BITUNPACK128V32_24(_iv, _ov, _parm_); break;\ - case 25: mv = _mm_set1_epi32((1u<<25)-1); BITUNPACK128V32_25(_iv, _ov, _parm_); break;\ - case 26: mv = _mm_set1_epi32((1u<<26)-1); BITUNPACK128V32_26(_iv, _ov, _parm_); break;\ - case 27: mv = _mm_set1_epi32((1u<<27)-1); BITUNPACK128V32_27(_iv, _ov, _parm_); break;\ - case 28: mv = _mm_set1_epi32((1u<<28)-1); BITUNPACK128V32_28(_iv, _ov, _parm_); break;\ - case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACK128V32_29(_iv, _ov, _parm_); break;\ - case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACK128V32_30(_iv, _ov, _parm_); break;\ - case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACK128V32_31(_iv, _ov, _parm_); break;\ - case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, _parm_); break;\ - case 33 ... 
63: break;\ - }\ -} - diff --git a/bitunpack256v_.h b/bitunpack256v_.h deleted file mode 100644 index 8aab565..0000000 --- a/bitunpack256v_.h +++ /dev/null @@ -1,2041 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// TurboPFor: Integer Compression SIMD bit unpacking -#define BITUNPACK256V32_0(ip, op, parm) {\ - BITUNBLK256V32_0(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_1(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 
7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 27),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 29),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 30),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define 
BITUNPACK256V32_1(ip, op, parm) {\ - BITUNBLK256V32_1(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_2(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 9,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_2(ip, op, parm) {\ - BITUNBLK256V32_2(ip, 0, op, parm);\ - BITUNBLK256V32_2(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_3(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 3,ov,parm); \ - 
ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 27),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 28),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); 
VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_3(ip, op, parm) {\ - BITUNBLK256V32_3(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_4(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*8+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*8+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*8+ 6,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK256V32_4(ip, op, parm) {\ - BITUNBLK256V32_4(ip, 0, op, parm);\ - BITUNBLK256V32_4(ip, 1, op, parm);\ - BITUNBLK256V32_4(ip, 2, op, parm);\ - BITUNBLK256V32_4(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_5(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+ 
4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 25),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 26),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_5(ip, op, parm) {\ - BITUNBLK256V32_5(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_6(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*16+ 9,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+11,ov,parm); \ - ov = 
_mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_6(ip, op, parm) {\ - BITUNBLK256V32_6(ip, 0, op, parm);\ - BITUNBLK256V32_6(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_7(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 24),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 
2),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 23),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_7(ip, op, parm) {\ - BITUNBLK256V32_7(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_8(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ - ov = 
_mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*4+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*4+ 2,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); VSTO(op,i*4+ 3,ov,parm); ;\ -} - -#define BITUNPACK256V32_8(ip, op, parm) {\ - BITUNBLK256V32_8(ip, 0, op, parm);\ - BITUNBLK256V32_8(ip, 1, op, parm);\ - BITUNBLK256V32_8(ip, 2, op, parm);\ - BITUNBLK256V32_8(ip, 3, op, parm);\ - BITUNBLK256V32_8(ip, 4, op, parm);\ - BITUNBLK256V32_8(ip, 5, op, parm);\ - BITUNBLK256V32_8(ip, 6, op, parm);\ - BITUNBLK256V32_8(ip, 7, op, parm);\ -} - -#define BITUNBLK256V32_9(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+ 2,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 22),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); 
VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 21),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+29,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+30,ov,parm); \ - ov = 
_mm256_srli_epi32(iv, 23); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_9(ip, op, parm) {\ - BITUNBLK256V32_9(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_10(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*16+ 2,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+ 4,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+11,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+13,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm256_srli_epi32(iv, 22); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_10(ip, op, parm) {\ - BITUNBLK256V32_10(ip, 0, op, parm);\ - 
BITUNBLK256V32_10(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_11(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - 
ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 19),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 20),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 21); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_11(ip, op, parm) {\ - BITUNBLK256V32_11(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_12(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); 
VSTO(op,i*8+ 1,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*8+ 3,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*8+ 4,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*8+ 6,ov,parm); \ - ov = _mm256_srli_epi32(iv, 20); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK256V32_12(ip, op, parm) {\ - BITUNBLK256V32_12(ip, 0, op, parm);\ - BITUNBLK256V32_12(ip, 1, op, parm);\ - BITUNBLK256V32_12(ip, 2, op, parm);\ - BITUNBLK256V32_12(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_13(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+ 8,ov,parm); 
\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 17),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+23,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = 
_mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 18),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 19); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_13(ip, op, parm) {\ - BITUNBLK256V32_13(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_14(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*16+ 1,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*16+ 3,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 7,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*16+ 8,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*16+12,ov,parm); \ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+14,ov,parm); \ - ov = _mm256_srli_epi32(iv, 18); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_14(ip, op, parm) {\ - BITUNBLK256V32_14(ip, 0, op, parm);\ - BITUNBLK256V32_14(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_15(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 15),mv); VSTO(op,i*32+ 1,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11),mv); VSTO(op,i*32+ 5,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+11,ov,parm); \ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+15,ov,parm); \ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 16),mv); VSTO(op,i*32+16,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12),mv); VSTO(op,i*32+20,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+26,ov,parm); \ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+30,ov,parm); \ - ov = _mm256_srli_epi32(iv, 17); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_15(ip, op, parm) {\ - BITUNBLK256V32_15(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_16(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*2+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 16); VSTO(op,i*2+ 1,ov,parm); ;\ -} - -#define BITUNPACK256V32_16(ip, op, parm) {\ - BITUNBLK256V32_16(ip, 0, op, parm);\ - BITUNBLK256V32_16(ip, 1, op, parm);\ - BITUNBLK256V32_16(ip, 2, op, parm);\ - BITUNBLK256V32_16(ip, 3, op, parm);\ - BITUNBLK256V32_16(ip, 4, op, parm);\ - BITUNBLK256V32_16(ip, 5, op, parm);\ - BITUNBLK256V32_16(ip, 6, op, parm);\ - BITUNBLK256V32_16(ip, 7, op, parm);\ - BITUNBLK256V32_16(ip, 8, op, parm);\ - BITUNBLK256V32_16(ip, 9, op, parm);\ - BITUNBLK256V32_16(ip, 10, op, parm);\ - BITUNBLK256V32_16(ip, 11, op, parm);\ - BITUNBLK256V32_16(ip, 12, op, parm);\ - BITUNBLK256V32_16(ip, 13, op, parm);\ - BITUNBLK256V32_16(ip, 14, op, parm);\ - BITUNBLK256V32_16(ip, 15, op, parm);\ -} 
/*
 * AVX2 bit-unpacking macros for 32-bit lanes (field widths 17..22).
 *
 * BITUNBLK256V32_<b>(ip, i, op, parm)
 *   Reads packed 256-bit vectors through `ip` (one `ip++` per load) and
 *   produces N output vectors. Output k is taken from every 32-bit lane at
 *   bit offset (k*b) mod 32:
 *     - field inside the current vector : (iv >> off) & mv
 *     - field crossing into the next one: (iv >> off) | ((next << (32-off)) & mv)
 *     - last field of the block         : iv >> off  (ends at bit 31, no mask)
 *   Each result is handed to VSTO(op, i*N+k, ov, parm).
 *     ip   - source pointer expression, post-incremented on every load
 *     i    - block index; selects output slots i*N .. i*N+N-1
 *     op   - forwarded to VSTO
 *     parm - forwarded to VSTO
 *   `mv` and VSTO are not defined here and must be in scope where the macro
 *   is expanded (mv is presumably the per-lane mask (1<<b)-1 -- TODO confirm).
 *
 * BITUNPACK256V32_<b>(ip, op, parm)
 *   Expands BITUNBLK256V32_<b> once per block index so that 32 output slots
 *   are produced in total.
 *
 * In the definitions completed below every load is cast to `__m256i *`, the
 * pointer type _mm256_loadu_si256() takes (the refill loads were previously
 * cast to `__m128i *`). The cast applies to the value of `ip++`, so how far
 * ip advances is still decided by ip's own declared type.
 */

/* Width 17: 32 slots per block, 17 vector loads. */
#define BITUNBLK256V32_17(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*32+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2), mv); VSTO(op,i*32+ 2,ov,parm);\
  ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*32+ 4,ov,parm);\
  ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 6,ov,parm);\
  ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm);\
  ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10), mv); VSTO(op,i*32+10,ov,parm);\
  ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12), mv); VSTO(op,i*32+12,ov,parm);\
  ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 14), mv); VSTO(op,i*32+14,ov,parm);\
  ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1), mv); VSTO(op,i*32+17,ov,parm);\
  ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3), mv); VSTO(op,i*32+19,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5), mv); VSTO(op,i*32+21,ov,parm);\
  ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7), mv); VSTO(op,i*32+23,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9), mv); VSTO(op,i*32+25,ov,parm);\
  ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11), mv); VSTO(op,i*32+27,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 13), mv); VSTO(op,i*32+29,ov,parm);\
  ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\
  ov = _mm256_srli_epi32(iv, 15); VSTO(op,i*32+31,ov,parm);\
}

/* Width 17: one block of 32 slots. */
#define BITUNPACK256V32_17(ip, op, parm) {\
  BITUNBLK256V32_17(ip, 0, op, parm);\
}

/* Width 18: 16 slots per block, 9 vector loads. */
#define BITUNBLK256V32_18(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*16+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 2,ov,parm);\
  ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*16+ 4,ov,parm);\
  ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 5,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12), mv); VSTO(op,i*16+ 6,ov,parm);\
  ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 9,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6), mv); VSTO(op,i*16+11,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10), mv); VSTO(op,i*16+13,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\
  ov = _mm256_srli_epi32(iv, 14); VSTO(op,i*16+15,ov,parm);\
}

/* Width 18: two blocks of 16 slots. */
#define BITUNPACK256V32_18(ip, op, parm) {\
  BITUNBLK256V32_18(ip, 0, op, parm);\
  BITUNBLK256V32_18(ip, 1, op, parm);\
}

/* Width 19: 32 slots per block, 19 vector loads. */
#define BITUNBLK256V32_19(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*32+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6), mv); VSTO(op,i*32+ 2,ov,parm);\
  ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 12), mv); VSTO(op,i*32+ 4,ov,parm);\
  ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\
  ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5), mv); VSTO(op,i*32+ 7,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 11), mv); VSTO(op,i*32+ 9,ov,parm);\
  ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\
  ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*32+12,ov,parm);\
  ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10), mv); VSTO(op,i*32+14,ov,parm);\
  ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3), mv); VSTO(op,i*32+17,ov,parm);\
  ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9), mv); VSTO(op,i*32+19,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\
  ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2), mv); VSTO(op,i*32+22,ov,parm);\
  ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*32+24,ov,parm);\
  ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\
  ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1), mv); VSTO(op,i*32+27,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7), mv); VSTO(op,i*32+29,ov,parm);\
  ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\
  ov = _mm256_srli_epi32(iv, 13); VSTO(op,i*32+31,ov,parm);\
}

/* Width 19: one block of 32 slots. */
#define BITUNPACK256V32_19(ip, op, parm) {\
  BITUNBLK256V32_19(ip, 0, op, parm);\
}

/* Width 20: 8 slots per block, 5 vector loads. */
#define BITUNBLK256V32_20(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*8+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*8+ 2,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*8+ 5,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\
  ov = _mm256_srli_epi32(iv, 12); VSTO(op,i*8+ 7,ov,parm);\
}

/* Width 20: four blocks of 8 slots. */
#define BITUNPACK256V32_20(ip, op, parm) {\
  BITUNBLK256V32_20(ip, 0, op, parm);\
  BITUNBLK256V32_20(ip, 1, op, parm);\
  BITUNBLK256V32_20(ip, 2, op, parm);\
  BITUNBLK256V32_20(ip, 3, op, parm);\
}

/* Width 21: 32 slots per block, 21 vector loads. */
#define BITUNBLK256V32_21(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*32+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 1,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 10), mv); VSTO(op,i*32+ 2,ov,parm);\
  ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 9), mv); VSTO(op,i*32+ 5,ov,parm);\
  ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\
  ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*32+ 8,ov,parm);\
  ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\
  ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7), mv); VSTO(op,i*32+11,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\
  ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6), mv); VSTO(op,i*32+14,ov,parm);\
  ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5), mv); VSTO(op,i*32+17,ov,parm);\
  ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\
  ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*32+20,ov,parm);\
  ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\
  ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3), mv); VSTO(op,i*32+23,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\
  ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2), mv); VSTO(op,i*32+26,ov,parm);\
  ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\
  ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1), mv); VSTO(op,i*32+29,ov,parm);\
  ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\
  ov = _mm256_srli_epi32(iv, 11); VSTO(op,i*32+31,ov,parm);\
}

/* Width 21: one block of 32 slots. */
#define BITUNPACK256V32_21(ip, op, parm) {\
  BITUNBLK256V32_21(ip, 0, op, parm);\
}

/* Width 22: 16 slots per block, 11 vector loads. */
#define BITUNBLK256V32_22(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256(iv, mv); VSTO(op,i*16+ 0,ov,parm);\
  ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\
  ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2), mv); VSTO(op,i*16+ 3,ov,parm);\
  ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\
  ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4), mv); VSTO(op,i*16+ 6,ov,parm);\
  ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\
  ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6), mv); VSTO(op,i*16+ 9,ov,parm);\
  ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\
  ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\
  ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8), mv); VSTO(op,i*16+12,ov,parm);\
  ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+13,ov,parm);\
  ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m256i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\
  ov = _mm256_srli_epi32(iv, 10); VSTO(op,i*16+15,ov,parm);\
}

/* Width 22: two blocks of 16 slots. */
#define BITUNPACK256V32_22(ip, op, parm) {\
  BITUNBLK256V32_22(ip, 0, op, parm);\
  BITUNBLK256V32_22(ip, 1, op, parm);\
}

/* Width 23: only the opening of this definition falls in this range; its tokens are left as found. */
#define BITUNBLK256V32_23(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\
  ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \
  ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\
  ov = _mm256_srli_epi32(iv, 14); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+ 3,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 7,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+14,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 7),mv); VSTO(op,i*32+17,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 8),mv); VSTO(op,i*32+24,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); 
VSTO(op,i*32+28,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_23(ip, op, parm) {\ - BITUNBLK256V32_23(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_24(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*4+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); VSTO(op,i*4+ 3,ov,parm); ;\ -} - -#define BITUNPACK256V32_24(ip, op, parm) {\ - BITUNBLK256V32_24(ip, 0, op, parm);\ - BITUNBLK256V32_24(ip, 1, op, parm);\ - BITUNBLK256V32_24(ip, 2, op, parm);\ - BITUNBLK256V32_24(ip, 3, op, parm);\ - BITUNBLK256V32_24(ip, 4, op, parm);\ - BITUNBLK256V32_24(ip, 5, op, parm);\ - BITUNBLK256V32_24(ip, 6, op, parm);\ - BITUNBLK256V32_24(ip, 7, op, parm);\ -} - -#define BITUNBLK256V32_25(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - 
ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+ 4,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+ 9,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 5),mv); VSTO(op,i*32+13,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); 
VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+18,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 6),mv); VSTO(op,i*32+22,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+27,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_25(ip, op, parm) {\ - BITUNBLK256V32_25(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_26(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*16+ 5,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*16+10,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_26(ip, op, parm) {\ - BITUNBLK256V32_26(ip, 0, op, parm);\ - BITUNBLK256V32_26(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_27(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+ 6,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 4),mv); VSTO(op,i*32+12,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+19,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 3),mv); VSTO(op,i*32+25,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); 
VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_27(ip, op, parm) {\ - BITUNBLK256V32_27(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_28(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*8+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); VSTO(op,i*8+ 7,ov,parm); ;\ -} - -#define BITUNPACK256V32_28(ip, op, parm) {\ - BITUNBLK256V32_28(ip, 0, op, parm);\ - BITUNBLK256V32_28(ip, 1, op, parm);\ - BITUNBLK256V32_28(ip, 2, op, parm);\ - 
BITUNBLK256V32_28(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_29(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 2),mv); VSTO(op,i*32+10,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_and_si256(_mm256_srli_epi32(iv, 1),mv); VSTO(op,i*32+21,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_29(ip, op, parm) {\ - BITUNBLK256V32_29(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_30(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*16+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); 
VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); VSTO(op,i*16+15,ov,parm); ;\ -} - -#define BITUNPACK256V32_30(ip, op, parm) {\ - BITUNBLK256V32_30(ip, 0, op, parm);\ - BITUNBLK256V32_30(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_31(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*32+ 0,ov,parm); \ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); VSTO(op,i*32+31,ov,parm); ;\ -} - -#define BITUNPACK256V32_31(ip, op, parm) {\ - 
BITUNBLK256V32_31(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_32(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_and_si256( iv ,mv); VSTO(op,i*1+ 0,ov,parm); ;\ -} - -#define BITUNPACK256V32_32(ip, op, parm) {\ - BITUNBLK256V32_32(ip, 0, op, parm);\ - BITUNBLK256V32_32(ip, 1, op, parm);\ - BITUNBLK256V32_32(ip, 2, op, parm);\ - BITUNBLK256V32_32(ip, 3, op, parm);\ - BITUNBLK256V32_32(ip, 4, op, parm);\ - BITUNBLK256V32_32(ip, 5, op, parm);\ - BITUNBLK256V32_32(ip, 6, op, parm);\ - BITUNBLK256V32_32(ip, 7, op, parm);\ - BITUNBLK256V32_32(ip, 8, op, parm);\ - BITUNBLK256V32_32(ip, 9, op, parm);\ - BITUNBLK256V32_32(ip, 10, op, parm);\ - BITUNBLK256V32_32(ip, 11, op, parm);\ - BITUNBLK256V32_32(ip, 12, op, parm);\ - BITUNBLK256V32_32(ip, 13, op, parm);\ - BITUNBLK256V32_32(ip, 14, op, parm);\ - BITUNBLK256V32_32(ip, 15, op, parm);\ - BITUNBLK256V32_32(ip, 16, op, parm);\ - BITUNBLK256V32_32(ip, 17, op, parm);\ - BITUNBLK256V32_32(ip, 18, op, parm);\ - BITUNBLK256V32_32(ip, 19, op, parm);\ - BITUNBLK256V32_32(ip, 20, op, parm);\ - BITUNBLK256V32_32(ip, 21, op, parm);\ - BITUNBLK256V32_32(ip, 22, op, parm);\ - BITUNBLK256V32_32(ip, 23, op, parm);\ - BITUNBLK256V32_32(ip, 24, op, parm);\ - BITUNBLK256V32_32(ip, 25, op, parm);\ - BITUNBLK256V32_32(ip, 26, op, parm);\ - BITUNBLK256V32_32(ip, 27, op, parm);\ - BITUNBLK256V32_32(ip, 28, op, parm);\ - BITUNBLK256V32_32(ip, 29, op, parm);\ - BITUNBLK256V32_32(ip, 30, op, parm);\ - BITUNBLK256V32_32(ip, 31, op, parm);\ -} - -#define BITUNBLK256V32_33(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_33(ip, op, parm) {\ - BITUNBLK256V32_33(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_34(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), 
mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_34(ip, op, parm) {\ - BITUNBLK256V32_34(ip, 0, op, parm);\ - BITUNBLK256V32_34(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_35(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), 
mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_35(ip, op, parm) {\ - BITUNBLK256V32_35(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_36(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK256V32_36(ip, op, 
parm) {\ - BITUNBLK256V32_36(ip, 0, op, parm);\ - BITUNBLK256V32_36(ip, 1, op, parm);\ - BITUNBLK256V32_36(ip, 2, op, parm);\ - BITUNBLK256V32_36(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_37(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), 
mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define 
BITUNPACK256V32_37(ip, op, parm) {\ - BITUNBLK256V32_37(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_38(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_38(ip, op, parm) {\ - BITUNBLK256V32_38(ip, 0, op, parm);\ - BITUNBLK256V32_38(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_39(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 
28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov 
= _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_39(ip, op, parm) {\ - BITUNBLK256V32_39(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_40(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 3,ov,parm);;\ -} - -#define BITUNPACK256V32_40(ip, op, parm) {\ - BITUNBLK256V32_40(ip, 0, op, parm);\ - BITUNBLK256V32_40(ip, 1, op, parm);\ - 
BITUNBLK256V32_40(ip, 2, op, parm);\ - BITUNBLK256V32_40(ip, 3, op, parm);\ - BITUNBLK256V32_40(ip, 4, op, parm);\ - BITUNBLK256V32_40(ip, 5, op, parm);\ - BITUNBLK256V32_40(ip, 6, op, parm);\ - BITUNBLK256V32_40(ip, 7, op, parm);\ -} - -#define BITUNBLK256V32_41(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), 
mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_41(ip, op, parm) {\ - BITUNBLK256V32_41(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_42(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 
9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_42(ip, op, parm) {\ - BITUNBLK256V32_42(ip, 0, op, parm);\ - BITUNBLK256V32_42(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_43(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); 
VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_43(ip, op, parm) {\ - BITUNBLK256V32_43(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_44(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK256V32_44(ip, op, parm) {\ - BITUNBLK256V32_44(ip, 0, op, parm);\ - BITUNBLK256V32_44(ip, 1, op, parm);\ - BITUNBLK256V32_44(ip, 2, op, parm);\ - BITUNBLK256V32_44(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_45(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); 
VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_45(ip, op, parm) {\ - BITUNBLK256V32_45(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_46(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_46(ip, op, parm) {\ - BITUNBLK256V32_46(ip, 0, op, parm);\ - BITUNBLK256V32_46(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_47(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_47(ip, op, parm) {\ - BITUNBLK256V32_47(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_48(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*2+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*2+ 1,ov,parm);;\ -} - -#define BITUNPACK256V32_48(ip, op, parm) {\ - BITUNBLK256V32_48(ip, 0, op, parm);\ - BITUNBLK256V32_48(ip, 1, op, parm);\ - BITUNBLK256V32_48(ip, 2, op, parm);\ - BITUNBLK256V32_48(ip, 3, op, parm);\ - BITUNBLK256V32_48(ip, 4, op, parm);\ - BITUNBLK256V32_48(ip, 5, op, parm);\ - BITUNBLK256V32_48(ip, 6, op, parm);\ - BITUNBLK256V32_48(ip, 7, op, parm);\ - BITUNBLK256V32_48(ip, 8, op, parm);\ - BITUNBLK256V32_48(ip, 9, op, parm);\ - BITUNBLK256V32_48(ip, 10, op, parm);\ - BITUNBLK256V32_48(ip, 11, op, parm);\ - BITUNBLK256V32_48(ip, 12, op, parm);\ - BITUNBLK256V32_48(ip, 13, op, parm);\ - BITUNBLK256V32_48(ip, 14, op, parm);\ - BITUNBLK256V32_48(ip, 15, op, parm);\ -} - -#define BITUNBLK256V32_49(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), 
mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_49(ip, op, parm) {\ - BITUNBLK256V32_49(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_50(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 
5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_50(ip, op, parm) {\ - BITUNBLK256V32_50(ip, 0, op, parm);\ - BITUNBLK256V32_50(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_51(ip, i, op, parm) { __m256i ov,iv = 
_mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_51(ip, op, parm) {\ - BITUNBLK256V32_51(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_52(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - -#define BITUNPACK256V32_52(ip, op, parm) {\ - BITUNBLK256V32_52(ip, 0, op, parm);\ - BITUNBLK256V32_52(ip, 1, op, parm);\ - BITUNBLK256V32_52(ip, 2, op, parm);\ - BITUNBLK256V32_52(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_53(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+ 
1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); 
iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_53(ip, op, parm) {\ - BITUNBLK256V32_53(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_54(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); 
VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_54(ip, op, parm) {\ - BITUNBLK256V32_54(ip, 0, op, parm);\ - BITUNBLK256V32_54(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_55(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 
31), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_55(ip, op, parm) {\ - BITUNBLK256V32_55(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_56(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*4+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*4+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*4+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*4+ 3,ov,parm);;\ -} - -#define BITUNPACK256V32_56(ip, op, parm) {\ - BITUNBLK256V32_56(ip, 0, op, parm);\ - BITUNBLK256V32_56(ip, 1, op, parm);\ - BITUNBLK256V32_56(ip, 2, op, parm);\ - BITUNBLK256V32_56(ip, 3, op, parm);\ - BITUNBLK256V32_56(ip, 4, op, parm);\ - BITUNBLK256V32_56(ip, 5, op, parm);\ - BITUNBLK256V32_56(ip, 6, op, parm);\ - BITUNBLK256V32_56(ip, 7, op, parm);\ -} - -#define BITUNBLK256V32_57(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), 
mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_57(ip, op, parm) {\ - BITUNBLK256V32_57(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_58(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv 
= _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_58(ip, op, parm) {\ - BITUNBLK256V32_58(ip, 0, op, parm);\ - BITUNBLK256V32_58(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_59(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i 
*)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), 
mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_59(ip, op, parm) {\ - BITUNBLK256V32_59(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_60(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*8+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*8+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*8+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*8+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*8+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*8+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*8+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*8+ 7,ov,parm);;\ -} - 
-#define BITUNPACK256V32_60(ip, op, parm) {\ - BITUNBLK256V32_60(ip, 0, op, parm);\ - BITUNBLK256V32_60(ip, 1, op, parm);\ - BITUNBLK256V32_60(ip, 2, op, parm);\ - BITUNBLK256V32_60(ip, 3, op, parm);\ -} - -#define BITUNBLK256V32_61(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = 
_mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, 
_mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), 
mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_61(ip, op, parm) {\ - BITUNBLK256V32_61(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_62(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*16+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*16+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*16+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*16+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*16+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*16+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*16+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*16+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*16+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*16+ 
9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*16+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*16+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*16+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*16+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*16+14,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*16+15,ov,parm);;\ -} - -#define BITUNPACK256V32_62(ip, op, parm) {\ - BITUNBLK256V32_62(ip, 0, op, parm);\ - BITUNBLK256V32_62(ip, 1, op, parm);\ -} - -#define BITUNBLK256V32_63(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*32+ 0,ov,parm);\ - ov = _mm256_srli_epi32(iv, 31); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 1), mv)); VSTO(op,i*32+ 1,ov,parm);\ - ov = _mm256_srli_epi32(iv, 30); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 2), mv)); VSTO(op,i*32+ 2,ov,parm);\ - ov = _mm256_srli_epi32(iv, 29); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 3), mv)); 
VSTO(op,i*32+ 3,ov,parm);\ - ov = _mm256_srli_epi32(iv, 28); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 4), mv)); VSTO(op,i*32+ 4,ov,parm);\ - ov = _mm256_srli_epi32(iv, 27); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 5), mv)); VSTO(op,i*32+ 5,ov,parm);\ - ov = _mm256_srli_epi32(iv, 26); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 6), mv)); VSTO(op,i*32+ 6,ov,parm);\ - ov = _mm256_srli_epi32(iv, 25); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 7), mv)); VSTO(op,i*32+ 7,ov,parm);\ - ov = _mm256_srli_epi32(iv, 24); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 8), mv)); VSTO(op,i*32+ 8,ov,parm);\ - ov = _mm256_srli_epi32(iv, 23); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 9), mv)); VSTO(op,i*32+ 9,ov,parm);\ - ov = _mm256_srli_epi32(iv, 22); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 10), mv)); VSTO(op,i*32+10,ov,parm);\ - ov = _mm256_srli_epi32(iv, 21); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 11), mv)); VSTO(op,i*32+11,ov,parm);\ - ov = _mm256_srli_epi32(iv, 20); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 12), mv)); VSTO(op,i*32+12,ov,parm);\ - ov = _mm256_srli_epi32(iv, 19); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 13), mv)); VSTO(op,i*32+13,ov,parm);\ - ov = _mm256_srli_epi32(iv, 18); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 14), mv)); VSTO(op,i*32+14,ov,parm);\ - ov = 
_mm256_srli_epi32(iv, 17); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 15), mv)); VSTO(op,i*32+15,ov,parm);\ - ov = _mm256_srli_epi32(iv, 16); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 16), mv)); VSTO(op,i*32+16,ov,parm);\ - ov = _mm256_srli_epi32(iv, 15); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 17), mv)); VSTO(op,i*32+17,ov,parm);\ - ov = _mm256_srli_epi32(iv, 14); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 18), mv)); VSTO(op,i*32+18,ov,parm);\ - ov = _mm256_srli_epi32(iv, 13); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 19), mv)); VSTO(op,i*32+19,ov,parm);\ - ov = _mm256_srli_epi32(iv, 12); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 20), mv)); VSTO(op,i*32+20,ov,parm);\ - ov = _mm256_srli_epi32(iv, 11); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 21), mv)); VSTO(op,i*32+21,ov,parm);\ - ov = _mm256_srli_epi32(iv, 10); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 22), mv)); VSTO(op,i*32+22,ov,parm);\ - ov = _mm256_srli_epi32(iv, 9); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 23), mv)); VSTO(op,i*32+23,ov,parm);\ - ov = _mm256_srli_epi32(iv, 8); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 24), mv)); VSTO(op,i*32+24,ov,parm);\ - ov = _mm256_srli_epi32(iv, 7); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 25), mv)); VSTO(op,i*32+25,ov,parm);\ - ov = _mm256_srli_epi32(iv, 6); iv = 
_mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 26), mv)); VSTO(op,i*32+26,ov,parm);\ - ov = _mm256_srli_epi32(iv, 5); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 27), mv)); VSTO(op,i*32+27,ov,parm);\ - ov = _mm256_srli_epi32(iv, 4); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 28), mv)); VSTO(op,i*32+28,ov,parm);\ - ov = _mm256_srli_epi32(iv, 3); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 29), mv)); VSTO(op,i*32+29,ov,parm);\ - ov = _mm256_srli_epi32(iv, 2); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 30), mv)); VSTO(op,i*32+30,ov,parm);\ - ov = _mm256_srli_epi32(iv, 1); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 31), mv)); VSTO(op,i*32+31,ov,parm);;\ -} - -#define BITUNPACK256V32_63(ip, op, parm) {\ - BITUNBLK256V32_63(ip, 0, op, parm);\ -} - -#define BITUNBLK256V32_64(ip, i, op, parm) { __m256i ov,iv = _mm256_loadu_si256((__m256i *)ip++);\ - ov = _mm256_srli_epi32(iv, 0); iv = _mm256_loadu_si256((__m128i *)ip++); ov = _mm256_or_si256(ov, _mm256_and_si256(_mm256_slli_epi32(iv, 32), mv)); VSTO(op,i*1+ 0,ov,parm);;\ -} - -#define BITUNPACK256V32_64(ip, op, parm) {\ - BITUNBLK256V32_64(ip, 0, op, parm);\ - BITUNBLK256V32_64(ip, 1, op, parm);\ - BITUNBLK256V32_64(ip, 2, op, parm);\ - BITUNBLK256V32_64(ip, 3, op, parm);\ - BITUNBLK256V32_64(ip, 4, op, parm);\ - BITUNBLK256V32_64(ip, 5, op, parm);\ - BITUNBLK256V32_64(ip, 6, op, parm);\ - BITUNBLK256V32_64(ip, 7, op, parm);\ - BITUNBLK256V32_64(ip, 8, op, parm);\ - BITUNBLK256V32_64(ip, 9, op, parm);\ - BITUNBLK256V32_64(ip, 10, op, parm);\ - BITUNBLK256V32_64(ip, 11, op, parm);\ - BITUNBLK256V32_64(ip, 12, op, parm);\ - BITUNBLK256V32_64(ip, 13, op, parm);\ - 
BITUNBLK256V32_64(ip, 14, op, parm);\ - BITUNBLK256V32_64(ip, 15, op, parm);\ - BITUNBLK256V32_64(ip, 16, op, parm);\ - BITUNBLK256V32_64(ip, 17, op, parm);\ - BITUNBLK256V32_64(ip, 18, op, parm);\ - BITUNBLK256V32_64(ip, 19, op, parm);\ - BITUNBLK256V32_64(ip, 20, op, parm);\ - BITUNBLK256V32_64(ip, 21, op, parm);\ - BITUNBLK256V32_64(ip, 22, op, parm);\ - BITUNBLK256V32_64(ip, 23, op, parm);\ - BITUNBLK256V32_64(ip, 24, op, parm);\ - BITUNBLK256V32_64(ip, 25, op, parm);\ - BITUNBLK256V32_64(ip, 26, op, parm);\ - BITUNBLK256V32_64(ip, 27, op, parm);\ - BITUNBLK256V32_64(ip, 28, op, parm);\ - BITUNBLK256V32_64(ip, 29, op, parm);\ - BITUNBLK256V32_64(ip, 30, op, parm);\ - BITUNBLK256V32_64(ip, 31, op, parm);\ -} - -#define BITUNPACK256V32(__ip, __nbits, __op, _parm_) { __m256i mv,*_ov=(__m256i *)__op,*_iv=(__m256i *)__ip; \ - switch(__nbits&0x3f) {\ - case 0: BITUNPACK0(_parm_); BITUNPACK256V32_0( _iv, _ov, _parm_); break;\ - case 1: mv = _mm256_set1_epi32((1u<< 1)-1); BITUNPACK256V32_1( _iv, _ov, _parm_); break;\ - case 2: mv = _mm256_set1_epi32((1u<< 2)-1); BITUNPACK256V32_2( _iv, _ov, _parm_); break;\ - case 3: mv = _mm256_set1_epi32((1u<< 3)-1); BITUNPACK256V32_3( _iv, _ov, _parm_); break;\ - case 4: mv = _mm256_set1_epi32((1u<< 4)-1); BITUNPACK256V32_4( _iv, _ov, _parm_); break;\ - case 5: mv = _mm256_set1_epi32((1u<< 5)-1); BITUNPACK256V32_5( _iv, _ov, _parm_); break;\ - case 6: mv = _mm256_set1_epi32((1u<< 6)-1); BITUNPACK256V32_6( _iv, _ov, _parm_); break;\ - case 7: mv = _mm256_set1_epi32((1u<< 7)-1); BITUNPACK256V32_7( _iv, _ov, _parm_); break;\ - case 8: mv = _mm256_set1_epi32((1u<< 8)-1); BITUNPACK256V32_8( _iv, _ov, _parm_); break;\ - case 9: mv = _mm256_set1_epi32((1u<< 9)-1); BITUNPACK256V32_9( _iv, _ov, _parm_); break;\ - case 10: mv = _mm256_set1_epi32((1u<<10)-1); BITUNPACK256V32_10(_iv, _ov, _parm_); break;\ - case 11: mv = _mm256_set1_epi32((1u<<11)-1); BITUNPACK256V32_11(_iv, _ov, _parm_); break;\ - case 12: mv = _mm256_set1_epi32((1u<<12)-1); 
BITUNPACK256V32_12(_iv, _ov, _parm_); break;\ - case 13: mv = _mm256_set1_epi32((1u<<13)-1); BITUNPACK256V32_13(_iv, _ov, _parm_); break;\ - case 14: mv = _mm256_set1_epi32((1u<<14)-1); BITUNPACK256V32_14(_iv, _ov, _parm_); break;\ - case 15: mv = _mm256_set1_epi32((1u<<15)-1); BITUNPACK256V32_15(_iv, _ov, _parm_); break;\ - case 16: mv = _mm256_set1_epi32((1u<<16)-1); BITUNPACK256V32_16(_iv, _ov, _parm_); break;\ - case 17: mv = _mm256_set1_epi32((1u<<17)-1); BITUNPACK256V32_17(_iv, _ov, _parm_); break;\ - case 18: mv = _mm256_set1_epi32((1u<<18)-1); BITUNPACK256V32_18(_iv, _ov, _parm_); break;\ - case 19: mv = _mm256_set1_epi32((1u<<19)-1); BITUNPACK256V32_19(_iv, _ov, _parm_); break;\ - case 20: mv = _mm256_set1_epi32((1u<<20)-1); BITUNPACK256V32_20(_iv, _ov, _parm_); break;\ - case 21: mv = _mm256_set1_epi32((1u<<21)-1); BITUNPACK256V32_21(_iv, _ov, _parm_); break;\ - case 22: mv = _mm256_set1_epi32((1u<<22)-1); BITUNPACK256V32_22(_iv, _ov, _parm_); break;\ - case 23: mv = _mm256_set1_epi32((1u<<23)-1); BITUNPACK256V32_23(_iv, _ov, _parm_); break;\ - case 24: mv = _mm256_set1_epi32((1u<<24)-1); BITUNPACK256V32_24(_iv, _ov, _parm_); break;\ - case 25: mv = _mm256_set1_epi32((1u<<25)-1); BITUNPACK256V32_25(_iv, _ov, _parm_); break;\ - case 26: mv = _mm256_set1_epi32((1u<<26)-1); BITUNPACK256V32_26(_iv, _ov, _parm_); break;\ - case 27: mv = _mm256_set1_epi32((1u<<27)-1); BITUNPACK256V32_27(_iv, _ov, _parm_); break;\ - case 28: mv = _mm256_set1_epi32((1u<<28)-1); BITUNPACK256V32_28(_iv, _ov, _parm_); break;\ - case 29: mv = _mm256_set1_epi32((1u<<29)-1); BITUNPACK256V32_29(_iv, _ov, _parm_); break;\ - case 30: mv = _mm256_set1_epi32((1u<<30)-1); BITUNPACK256V32_30(_iv, _ov, _parm_); break;\ - case 31: mv = _mm256_set1_epi32((1u<<31)-1); BITUNPACK256V32_31(_iv, _ov, _parm_); break;\ - case 32: mv = _mm256_set1_epi32((1ull<<32)-1);BITUNPACK256V32_32(_iv, _ov, _parm_); break;\ - case 33 ... 
63: break;\ - }\ -} - diff --git a/bitunpack64_.h b/bitunpack64_.h deleted file mode 100644 index 4eff260..0000000 --- a/bitunpack64_.h +++ /dev/null @@ -1,3104 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// bitunpack include -#define BITUNBLK32_0(ip, i, op, parm) { \ - DST(op,i*0+ 0, 0, parm);\ - DST(op,i*0+ 1, 0, parm);\ - DST(op,i*0+ 2, 0, parm);\ - DST(op,i*0+ 3, 0, parm);\ - DST(op,i*0+ 4, 0, parm);\ - DST(op,i*0+ 5, 0, parm);\ - DST(op,i*0+ 6, 0, parm);\ - DST(op,i*0+ 7, 0, parm);\ - DST(op,i*0+ 8, 0, parm);\ - DST(op,i*0+ 9, 0, parm);\ - DST(op,i*0+10, 0, parm);\ - DST(op,i*0+11, 0, parm);\ - DST(op,i*0+12, 0, parm);\ - DST(op,i*0+13, 0, parm);\ - DST(op,i*0+14, 0, parm);\ - DST(op,i*0+15, 0, parm);\ - DST(op,i*0+16, 0, parm);\ - DST(op,i*0+17, 0, parm);\ - DST(op,i*0+18, 0, parm);\ - DST(op,i*0+19, 0, parm);\ - DST(op,i*0+20, 0, parm);\ - DST(op,i*0+21, 0, parm);\ - DST(op,i*0+22, 0, parm);\ - DST(op,i*0+23, 0, parm);\ - DST(op,i*0+24, 0, parm);\ - DST(op,i*0+25, 0, parm);\ - DST(op,i*0+26, 0, parm);\ - DST(op,i*0+27, 0, parm);\ - DST(op,i*0+28, 0, parm);\ 
- DST(op,i*0+29, 0, parm);\ - DST(op,i*0+30, 0, parm);\ - DST(op,i*0+31, 0, parm);;\ -} - -#define BITUNPACK64_0(ip, op, parm) { \ - BITUNBLK32_0(ip, 0, op, parm); DSTI(op);\ -} - -#define BITUNBLK32_1(ip, i, op, parm) { register uint32_t w0 = *(uint32_t *)(ip+(i*1+0)*4/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x1, parm);\ - DST(op,i*32+ 1, (w0 >> 1) & 0x1, parm);\ - DST(op,i*32+ 2, (w0 >> 2) & 0x1, parm);\ - DST(op,i*32+ 3, (w0 >> 3) & 0x1, parm);\ - DST(op,i*32+ 4, (w0 >> 4) & 0x1, parm);\ - DST(op,i*32+ 5, (w0 >> 5) & 0x1, parm);\ - DST(op,i*32+ 6, (w0 >> 6) & 0x1, parm);\ - DST(op,i*32+ 7, (w0 >> 7) & 0x1, parm);\ - DST(op,i*32+ 8, (w0 >> 8) & 0x1, parm);\ - DST(op,i*32+ 9, (w0 >> 9) & 0x1, parm);\ - DST(op,i*32+10, (w0 >> 10) & 0x1, parm);\ - DST(op,i*32+11, (w0 >> 11) & 0x1, parm);\ - DST(op,i*32+12, (w0 >> 12) & 0x1, parm);\ - DST(op,i*32+13, (w0 >> 13) & 0x1, parm);\ - DST(op,i*32+14, (w0 >> 14) & 0x1, parm);\ - DST(op,i*32+15, (w0 >> 15) & 0x1, parm);\ - DST(op,i*32+16, (w0 >> 16) & 0x1, parm);\ - DST(op,i*32+17, (w0 >> 17) & 0x1, parm);\ - DST(op,i*32+18, (w0 >> 18) & 0x1, parm);\ - DST(op,i*32+19, (w0 >> 19) & 0x1, parm);\ - DST(op,i*32+20, (w0 >> 20) & 0x1, parm);\ - DST(op,i*32+21, (w0 >> 21) & 0x1, parm);\ - DST(op,i*32+22, (w0 >> 22) & 0x1, parm);\ - DST(op,i*32+23, (w0 >> 23) & 0x1, parm);\ - DST(op,i*32+24, (w0 >> 24) & 0x1, parm);\ - DST(op,i*32+25, (w0 >> 25) & 0x1, parm);\ - DST(op,i*32+26, (w0 >> 26) & 0x1, parm);\ - DST(op,i*32+27, (w0 >> 27) & 0x1, parm);\ - DST(op,i*32+28, (w0 >> 28) & 0x1, parm);\ - DST(op,i*32+29, (w0 >> 29) & 0x1, parm);\ - DST(op,i*32+30, (w0 >> 30) & 0x1, parm);\ - DST(op,i*32+31, (w0 >> 31) , parm);;\ -} - -#define BITUNPACK64_1(ip, op, parm) { \ - BITUNBLK32_1(ip, 0, op, parm); DSTI(op); ip += 1*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_2(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3, parm);\ - DST(op,i*32+ 1, (w0 >> 2) & 0x3, parm);\ - 
DST(op,i*32+ 2, (w0 >> 4) & 0x3, parm);\ - DST(op,i*32+ 3, (w0 >> 6) & 0x3, parm);\ - DST(op,i*32+ 4, (w0 >> 8) & 0x3, parm);\ - DST(op,i*32+ 5, (w0 >> 10) & 0x3, parm);\ - DST(op,i*32+ 6, (w0 >> 12) & 0x3, parm);\ - DST(op,i*32+ 7, (w0 >> 14) & 0x3, parm);\ - DST(op,i*32+ 8, (w0 >> 16) & 0x3, parm);\ - DST(op,i*32+ 9, (w0 >> 18) & 0x3, parm);\ - DST(op,i*32+10, (w0 >> 20) & 0x3, parm);\ - DST(op,i*32+11, (w0 >> 22) & 0x3, parm);\ - DST(op,i*32+12, (w0 >> 24) & 0x3, parm);\ - DST(op,i*32+13, (w0 >> 26) & 0x3, parm);\ - DST(op,i*32+14, (w0 >> 28) & 0x3, parm);\ - DST(op,i*32+15, (w0 >> 30) & 0x3, parm);\ - DST(op,i*32+16, (w0 >> 32) & 0x3, parm);\ - DST(op,i*32+17, (w0 >> 34) & 0x3, parm);\ - DST(op,i*32+18, (w0 >> 36) & 0x3, parm);\ - DST(op,i*32+19, (w0 >> 38) & 0x3, parm);\ - DST(op,i*32+20, (w0 >> 40) & 0x3, parm);\ - DST(op,i*32+21, (w0 >> 42) & 0x3, parm);\ - DST(op,i*32+22, (w0 >> 44) & 0x3, parm);\ - DST(op,i*32+23, (w0 >> 46) & 0x3, parm);\ - DST(op,i*32+24, (w0 >> 48) & 0x3, parm);\ - DST(op,i*32+25, (w0 >> 50) & 0x3, parm);\ - DST(op,i*32+26, (w0 >> 52) & 0x3, parm);\ - DST(op,i*32+27, (w0 >> 54) & 0x3, parm);\ - DST(op,i*32+28, (w0 >> 56) & 0x3, parm);\ - DST(op,i*32+29, (w0 >> 58) & 0x3, parm);\ - DST(op,i*32+30, (w0 >> 60) & 0x3, parm);\ - DST(op,i*32+31, (w0 >> 62) , parm);;\ -} - -#define BITUNPACK64_2(ip, op, parm) { \ - BITUNBLK64_2(ip, 0, op, parm); DSTI(op); ip += 2*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_3(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7, parm);\ - DST(op,i*64+ 1, (w0 >> 3) & 0x7, parm);\ - DST(op,i*64+ 2, (w0 >> 6) & 0x7, parm);\ - DST(op,i*64+ 3, (w0 >> 9) & 0x7, parm);\ - DST(op,i*64+ 4, (w0 >> 12) & 0x7, parm);\ - DST(op,i*64+ 5, (w0 >> 15) & 0x7, parm);\ - DST(op,i*64+ 6, (w0 >> 18) & 0x7, parm);\ - DST(op,i*64+ 7, (w0 >> 21) & 0x7, parm);\ - DST(op,i*64+ 8, (w0 >> 24) & 0x7, parm);\ - DST(op,i*64+ 9, (w0 >> 27) & 0x7, parm);\ - DST(op,i*64+10, (w0 >> 30) 
& 0x7, parm);\ - DST(op,i*64+11, (w0 >> 33) & 0x7, parm);\ - DST(op,i*64+12, (w0 >> 36) & 0x7, parm);\ - DST(op,i*64+13, (w0 >> 39) & 0x7, parm);\ - DST(op,i*64+14, (w0 >> 42) & 0x7, parm);\ - DST(op,i*64+15, (w0 >> 45) & 0x7, parm);\ - DST(op,i*64+16, (w0 >> 48) & 0x7, parm);\ - DST(op,i*64+17, (w0 >> 51) & 0x7, parm);\ - DST(op,i*64+18, (w0 >> 54) & 0x7, parm);\ - DST(op,i*64+19, (w0 >> 57) & 0x7, parm);\ - DST(op,i*64+20, (w0 >> 60) & 0x7, parm); register uint64_t w1 = *(uint32_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w0 >> 63) | (w1 << 1) & 0x7, parm);\ - DST(op,i*64+22, (w1 >> 2) & 0x7, parm);\ - DST(op,i*64+23, (w1 >> 5) & 0x7, parm);\ - DST(op,i*64+24, (w1 >> 8) & 0x7, parm);\ - DST(op,i*64+25, (w1 >> 11) & 0x7, parm);\ - DST(op,i*64+26, (w1 >> 14) & 0x7, parm);\ - DST(op,i*64+27, (w1 >> 17) & 0x7, parm);\ - DST(op,i*64+28, (w1 >> 20) & 0x7, parm);\ - DST(op,i*64+29, (w1 >> 23) & 0x7, parm);\ - DST(op,i*64+30, (w1 >> 26) & 0x7, parm);\ - DST(op,i*64+31, (w1 >> 29) & 0x7, parm);;\ -} - -#define BITUNPACK64_3(ip, op, parm) { \ - BITUNBLK64_3(ip, 0, op, parm); DSTI(op); ip += 3*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_4(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xf, parm);\ - DST(op,i*16+ 1, (w0 >> 4) & 0xf, parm);\ - DST(op,i*16+ 2, (w0 >> 8) & 0xf, parm);\ - DST(op,i*16+ 3, (w0 >> 12) & 0xf, parm);\ - DST(op,i*16+ 4, (w0 >> 16) & 0xf, parm);\ - DST(op,i*16+ 5, (w0 >> 20) & 0xf, parm);\ - DST(op,i*16+ 6, (w0 >> 24) & 0xf, parm);\ - DST(op,i*16+ 7, (w0 >> 28) & 0xf, parm);\ - DST(op,i*16+ 8, (w0 >> 32) & 0xf, parm);\ - DST(op,i*16+ 9, (w0 >> 36) & 0xf, parm);\ - DST(op,i*16+10, (w0 >> 40) & 0xf, parm);\ - DST(op,i*16+11, (w0 >> 44) & 0xf, parm);\ - DST(op,i*16+12, (w0 >> 48) & 0xf, parm);\ - DST(op,i*16+13, (w0 >> 52) & 0xf, parm);\ - DST(op,i*16+14, (w0 >> 56) & 0xf, parm);\ - DST(op,i*16+15, (w0 >> 60) , parm);;\ -} - -#define BITUNPACK64_4(ip, op, parm) { \ - 
BITUNBLK64_4(ip, 0, op, parm);\ - BITUNBLK64_4(ip, 1, op, parm); DSTI(op); ip += 4*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_5(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1f, parm);\ - DST(op,i*64+ 1, (w0 >> 5) & 0x1f, parm);\ - DST(op,i*64+ 2, (w0 >> 10) & 0x1f, parm);\ - DST(op,i*64+ 3, (w0 >> 15) & 0x1f, parm);\ - DST(op,i*64+ 4, (w0 >> 20) & 0x1f, parm);\ - DST(op,i*64+ 5, (w0 >> 25) & 0x1f, parm);\ - DST(op,i*64+ 6, (w0 >> 30) & 0x1f, parm);\ - DST(op,i*64+ 7, (w0 >> 35) & 0x1f, parm);\ - DST(op,i*64+ 8, (w0 >> 40) & 0x1f, parm);\ - DST(op,i*64+ 9, (w0 >> 45) & 0x1f, parm);\ - DST(op,i*64+10, (w0 >> 50) & 0x1f, parm);\ - DST(op,i*64+11, (w0 >> 55) & 0x1f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w0 >> 60) | (w1 << 4) & 0x1f, parm);\ - DST(op,i*64+13, (w1 >> 1) & 0x1f, parm);\ - DST(op,i*64+14, (w1 >> 6) & 0x1f, parm);\ - DST(op,i*64+15, (w1 >> 11) & 0x1f, parm);\ - DST(op,i*64+16, (w1 >> 16) & 0x1f, parm);\ - DST(op,i*64+17, (w1 >> 21) & 0x1f, parm);\ - DST(op,i*64+18, (w1 >> 26) & 0x1f, parm);\ - DST(op,i*64+19, (w1 >> 31) & 0x1f, parm);\ - DST(op,i*64+20, (w1 >> 36) & 0x1f, parm);\ - DST(op,i*64+21, (w1 >> 41) & 0x1f, parm);\ - DST(op,i*64+22, (w1 >> 46) & 0x1f, parm);\ - DST(op,i*64+23, (w1 >> 51) & 0x1f, parm);\ - DST(op,i*64+24, (w1 >> 56) & 0x1f, parm); register uint64_t w2 = *(uint32_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w1 >> 61) | (w2 << 3) & 0x1f, parm);\ - DST(op,i*64+26, (w2 >> 2) & 0x1f, parm);\ - DST(op,i*64+27, (w2 >> 7) & 0x1f, parm);\ - DST(op,i*64+28, (w2 >> 12) & 0x1f, parm);\ - DST(op,i*64+29, (w2 >> 17) & 0x1f, parm);\ - DST(op,i*64+30, (w2 >> 22) & 0x1f, parm);\ - DST(op,i*64+31, (w2 >> 27) & 0x1f, parm);;\ -} - -#define BITUNPACK64_5(ip, op, parm) { \ - BITUNBLK64_5(ip, 0, op, parm); DSTI(op); ip += 5*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_6(ip, i, op, parm) { register uint64_t w0 = 
*(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3f, parm);\ - DST(op,i*32+ 1, (w0 >> 6) & 0x3f, parm);\ - DST(op,i*32+ 2, (w0 >> 12) & 0x3f, parm);\ - DST(op,i*32+ 3, (w0 >> 18) & 0x3f, parm);\ - DST(op,i*32+ 4, (w0 >> 24) & 0x3f, parm);\ - DST(op,i*32+ 5, (w0 >> 30) & 0x3f, parm);\ - DST(op,i*32+ 6, (w0 >> 36) & 0x3f, parm);\ - DST(op,i*32+ 7, (w0 >> 42) & 0x3f, parm);\ - DST(op,i*32+ 8, (w0 >> 48) & 0x3f, parm);\ - DST(op,i*32+ 9, (w0 >> 54) & 0x3f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w0 >> 60) | (w1 << 4) & 0x3f, parm);\ - DST(op,i*32+11, (w1 >> 2) & 0x3f, parm);\ - DST(op,i*32+12, (w1 >> 8) & 0x3f, parm);\ - DST(op,i*32+13, (w1 >> 14) & 0x3f, parm);\ - DST(op,i*32+14, (w1 >> 20) & 0x3f, parm);\ - DST(op,i*32+15, (w1 >> 26) & 0x3f, parm);\ - DST(op,i*32+16, (w1 >> 32) & 0x3f, parm);\ - DST(op,i*32+17, (w1 >> 38) & 0x3f, parm);\ - DST(op,i*32+18, (w1 >> 44) & 0x3f, parm);\ - DST(op,i*32+19, (w1 >> 50) & 0x3f, parm);\ - DST(op,i*32+20, (w1 >> 56) & 0x3f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w1 >> 62) | (w2 << 2) & 0x3f, parm);\ - DST(op,i*32+22, (w2 >> 4) & 0x3f, parm);\ - DST(op,i*32+23, (w2 >> 10) & 0x3f, parm);\ - DST(op,i*32+24, (w2 >> 16) & 0x3f, parm);\ - DST(op,i*32+25, (w2 >> 22) & 0x3f, parm);\ - DST(op,i*32+26, (w2 >> 28) & 0x3f, parm);\ - DST(op,i*32+27, (w2 >> 34) & 0x3f, parm);\ - DST(op,i*32+28, (w2 >> 40) & 0x3f, parm);\ - DST(op,i*32+29, (w2 >> 46) & 0x3f, parm);\ - DST(op,i*32+30, (w2 >> 52) & 0x3f, parm);\ - DST(op,i*32+31, (w2 >> 58) , parm);;\ -} - -#define BITUNPACK64_6(ip, op, parm) { \ - BITUNBLK64_6(ip, 0, op, parm); DSTI(op); ip += 6*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_7(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7f, parm);\ - DST(op,i*64+ 1, (w0 >> 7) & 0x7f, parm);\ - DST(op,i*64+ 2, (w0 >> 14) & 0x7f, parm);\ - 
DST(op,i*64+ 3, (w0 >> 21) & 0x7f, parm);\ - DST(op,i*64+ 4, (w0 >> 28) & 0x7f, parm);\ - DST(op,i*64+ 5, (w0 >> 35) & 0x7f, parm);\ - DST(op,i*64+ 6, (w0 >> 42) & 0x7f, parm);\ - DST(op,i*64+ 7, (w0 >> 49) & 0x7f, parm);\ - DST(op,i*64+ 8, (w0 >> 56) & 0x7f, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w0 >> 63) | (w1 << 1) & 0x7f, parm);\ - DST(op,i*64+10, (w1 >> 6) & 0x7f, parm);\ - DST(op,i*64+11, (w1 >> 13) & 0x7f, parm);\ - DST(op,i*64+12, (w1 >> 20) & 0x7f, parm);\ - DST(op,i*64+13, (w1 >> 27) & 0x7f, parm);\ - DST(op,i*64+14, (w1 >> 34) & 0x7f, parm);\ - DST(op,i*64+15, (w1 >> 41) & 0x7f, parm);\ - DST(op,i*64+16, (w1 >> 48) & 0x7f, parm);\ - DST(op,i*64+17, (w1 >> 55) & 0x7f, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w1 >> 62) | (w2 << 2) & 0x7f, parm);\ - DST(op,i*64+19, (w2 >> 5) & 0x7f, parm);\ - DST(op,i*64+20, (w2 >> 12) & 0x7f, parm);\ - DST(op,i*64+21, (w2 >> 19) & 0x7f, parm);\ - DST(op,i*64+22, (w2 >> 26) & 0x7f, parm);\ - DST(op,i*64+23, (w2 >> 33) & 0x7f, parm);\ - DST(op,i*64+24, (w2 >> 40) & 0x7f, parm);\ - DST(op,i*64+25, (w2 >> 47) & 0x7f, parm);\ - DST(op,i*64+26, (w2 >> 54) & 0x7f, parm); register uint64_t w3 = *(uint32_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w2 >> 61) | (w3 << 3) & 0x7f, parm);\ - DST(op,i*64+28, (w3 >> 4) & 0x7f, parm);\ - DST(op,i*64+29, (w3 >> 11) & 0x7f, parm);\ - DST(op,i*64+30, (w3 >> 18) & 0x7f, parm);\ - DST(op,i*64+31, (w3 >> 25) & 0x7f, parm);;\ -} - -#define BITUNPACK64_7(ip, op, parm) { \ - BITUNBLK64_7(ip, 0, op, parm); DSTI(op); ip += 7*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_8(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ - DST(op,i*8+ 0, (w0 ) & 0xff, parm);\ - DST(op,i*8+ 1, (w0 >> 8) & 0xff, parm);\ - DST(op,i*8+ 2, (w0 >> 16) & 0xff, parm);\ - DST(op,i*8+ 3, (w0 >> 24) & 0xff, parm);\ - DST(op,i*8+ 4, (w0 >> 32) & 0xff, parm);\ - DST(op,i*8+ 
5, (w0 >> 40) & 0xff, parm);\ - DST(op,i*8+ 6, (w0 >> 48) & 0xff, parm);\ - DST(op,i*8+ 7, (w0 >> 56) , parm);;\ -} - -#define BITUNPACK64_8(ip, op, parm) { \ - BITUNBLK64_8(ip, 0, op, parm);\ - BITUNBLK64_8(ip, 1, op, parm);\ - BITUNBLK64_8(ip, 2, op, parm);\ - BITUNBLK64_8(ip, 3, op, parm); DSTI(op); ip += 8*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_9(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ff, parm);\ - DST(op,i*64+ 1, (w0 >> 9) & 0x1ff, parm);\ - DST(op,i*64+ 2, (w0 >> 18) & 0x1ff, parm);\ - DST(op,i*64+ 3, (w0 >> 27) & 0x1ff, parm);\ - DST(op,i*64+ 4, (w0 >> 36) & 0x1ff, parm);\ - DST(op,i*64+ 5, (w0 >> 45) & 0x1ff, parm);\ - DST(op,i*64+ 6, (w0 >> 54) & 0x1ff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w0 >> 63) | (w1 << 1) & 0x1ff, parm);\ - DST(op,i*64+ 8, (w1 >> 8) & 0x1ff, parm);\ - DST(op,i*64+ 9, (w1 >> 17) & 0x1ff, parm);\ - DST(op,i*64+10, (w1 >> 26) & 0x1ff, parm);\ - DST(op,i*64+11, (w1 >> 35) & 0x1ff, parm);\ - DST(op,i*64+12, (w1 >> 44) & 0x1ff, parm);\ - DST(op,i*64+13, (w1 >> 53) & 0x1ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w1 >> 62) | (w2 << 2) & 0x1ff, parm);\ - DST(op,i*64+15, (w2 >> 7) & 0x1ff, parm);\ - DST(op,i*64+16, (w2 >> 16) & 0x1ff, parm);\ - DST(op,i*64+17, (w2 >> 25) & 0x1ff, parm);\ - DST(op,i*64+18, (w2 >> 34) & 0x1ff, parm);\ - DST(op,i*64+19, (w2 >> 43) & 0x1ff, parm);\ - DST(op,i*64+20, (w2 >> 52) & 0x1ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w2 >> 61) | (w3 << 3) & 0x1ff, parm);\ - DST(op,i*64+22, (w3 >> 6) & 0x1ff, parm);\ - DST(op,i*64+23, (w3 >> 15) & 0x1ff, parm);\ - DST(op,i*64+24, (w3 >> 24) & 0x1ff, parm);\ - DST(op,i*64+25, (w3 >> 33) & 0x1ff, parm);\ - DST(op,i*64+26, (w3 >> 42) & 0x1ff, parm);\ - DST(op,i*64+27, (w3 >> 51) & 0x1ff, parm); register uint64_t w4 = *(uint32_t 
*)(ip+(i*9+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w3 >> 60) | (w4 << 4) & 0x1ff, parm);\ - DST(op,i*64+29, (w4 >> 5) & 0x1ff, parm);\ - DST(op,i*64+30, (w4 >> 14) & 0x1ff, parm);\ - DST(op,i*64+31, (w4 >> 23) & 0x1ff, parm);;\ -} - -#define BITUNPACK64_9(ip, op, parm) { \ - BITUNBLK64_9(ip, 0, op, parm); DSTI(op); ip += 9*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_10(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ff, parm);\ - DST(op,i*32+ 1, (w0 >> 10) & 0x3ff, parm);\ - DST(op,i*32+ 2, (w0 >> 20) & 0x3ff, parm);\ - DST(op,i*32+ 3, (w0 >> 30) & 0x3ff, parm);\ - DST(op,i*32+ 4, (w0 >> 40) & 0x3ff, parm);\ - DST(op,i*32+ 5, (w0 >> 50) & 0x3ff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w0 >> 60) | (w1 << 4) & 0x3ff, parm);\ - DST(op,i*32+ 7, (w1 >> 6) & 0x3ff, parm);\ - DST(op,i*32+ 8, (w1 >> 16) & 0x3ff, parm);\ - DST(op,i*32+ 9, (w1 >> 26) & 0x3ff, parm);\ - DST(op,i*32+10, (w1 >> 36) & 0x3ff, parm);\ - DST(op,i*32+11, (w1 >> 46) & 0x3ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w1 >> 56) | (w2 << 8) & 0x3ff, parm);\ - DST(op,i*32+13, (w2 >> 2) & 0x3ff, parm);\ - DST(op,i*32+14, (w2 >> 12) & 0x3ff, parm);\ - DST(op,i*32+15, (w2 >> 22) & 0x3ff, parm);\ - DST(op,i*32+16, (w2 >> 32) & 0x3ff, parm);\ - DST(op,i*32+17, (w2 >> 42) & 0x3ff, parm);\ - DST(op,i*32+18, (w2 >> 52) & 0x3ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w2 >> 62) | (w3 << 2) & 0x3ff, parm);\ - DST(op,i*32+20, (w3 >> 8) & 0x3ff, parm);\ - DST(op,i*32+21, (w3 >> 18) & 0x3ff, parm);\ - DST(op,i*32+22, (w3 >> 28) & 0x3ff, parm);\ - DST(op,i*32+23, (w3 >> 38) & 0x3ff, parm);\ - DST(op,i*32+24, (w3 >> 48) & 0x3ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w3 >> 58) | (w4 << 6) & 0x3ff, parm);\ - DST(op,i*32+26, (w4 >> 
4) & 0x3ff, parm);\ - DST(op,i*32+27, (w4 >> 14) & 0x3ff, parm);\ - DST(op,i*32+28, (w4 >> 24) & 0x3ff, parm);\ - DST(op,i*32+29, (w4 >> 34) & 0x3ff, parm);\ - DST(op,i*32+30, (w4 >> 44) & 0x3ff, parm);\ - DST(op,i*32+31, (w4 >> 54) , parm);;\ -} - -#define BITUNPACK64_10(ip, op, parm) { \ - BITUNBLK64_10(ip, 0, op, parm); DSTI(op); ip += 10*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_11(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ff, parm);\ - DST(op,i*64+ 1, (w0 >> 11) & 0x7ff, parm);\ - DST(op,i*64+ 2, (w0 >> 22) & 0x7ff, parm);\ - DST(op,i*64+ 3, (w0 >> 33) & 0x7ff, parm);\ - DST(op,i*64+ 4, (w0 >> 44) & 0x7ff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w0 >> 55) | (w1 << 9) & 0x7ff, parm);\ - DST(op,i*64+ 6, (w1 >> 2) & 0x7ff, parm);\ - DST(op,i*64+ 7, (w1 >> 13) & 0x7ff, parm);\ - DST(op,i*64+ 8, (w1 >> 24) & 0x7ff, parm);\ - DST(op,i*64+ 9, (w1 >> 35) & 0x7ff, parm);\ - DST(op,i*64+10, (w1 >> 46) & 0x7ff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w1 >> 57) | (w2 << 7) & 0x7ff, parm);\ - DST(op,i*64+12, (w2 >> 4) & 0x7ff, parm);\ - DST(op,i*64+13, (w2 >> 15) & 0x7ff, parm);\ - DST(op,i*64+14, (w2 >> 26) & 0x7ff, parm);\ - DST(op,i*64+15, (w2 >> 37) & 0x7ff, parm);\ - DST(op,i*64+16, (w2 >> 48) & 0x7ff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w2 >> 59) | (w3 << 5) & 0x7ff, parm);\ - DST(op,i*64+18, (w3 >> 6) & 0x7ff, parm);\ - DST(op,i*64+19, (w3 >> 17) & 0x7ff, parm);\ - DST(op,i*64+20, (w3 >> 28) & 0x7ff, parm);\ - DST(op,i*64+21, (w3 >> 39) & 0x7ff, parm);\ - DST(op,i*64+22, (w3 >> 50) & 0x7ff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w3 >> 61) | (w4 << 3) & 0x7ff, parm);\ - DST(op,i*64+24, (w4 >> 8) & 0x7ff, parm);\ - DST(op,i*64+25, (w4 >> 19) & 0x7ff, parm);\ - 
DST(op,i*64+26, (w4 >> 30) & 0x7ff, parm);\ - DST(op,i*64+27, (w4 >> 41) & 0x7ff, parm);\ - DST(op,i*64+28, (w4 >> 52) & 0x7ff, parm); register uint64_t w5 = *(uint32_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w4 >> 63) | (w5 << 1) & 0x7ff, parm);\ - DST(op,i*64+30, (w5 >> 10) & 0x7ff, parm);\ - DST(op,i*64+31, (w5 >> 21) & 0x7ff, parm);;\ -} - -#define BITUNPACK64_11(ip, op, parm) { \ - BITUNBLK64_11(ip, 0, op, parm); DSTI(op); ip += 11*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_12(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfff, parm);\ - DST(op,i*16+ 1, (w0 >> 12) & 0xfff, parm);\ - DST(op,i*16+ 2, (w0 >> 24) & 0xfff, parm);\ - DST(op,i*16+ 3, (w0 >> 36) & 0xfff, parm);\ - DST(op,i*16+ 4, (w0 >> 48) & 0xfff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 5, (w0 >> 60) | (w1 << 4) & 0xfff, parm);\ - DST(op,i*16+ 6, (w1 >> 8) & 0xfff, parm);\ - DST(op,i*16+ 7, (w1 >> 20) & 0xfff, parm);\ - DST(op,i*16+ 8, (w1 >> 32) & 0xfff, parm);\ - DST(op,i*16+ 9, (w1 >> 44) & 0xfff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+10, (w1 >> 56) | (w2 << 8) & 0xfff, parm);\ - DST(op,i*16+11, (w2 >> 4) & 0xfff, parm);\ - DST(op,i*16+12, (w2 >> 16) & 0xfff, parm);\ - DST(op,i*16+13, (w2 >> 28) & 0xfff, parm);\ - DST(op,i*16+14, (w2 >> 40) & 0xfff, parm);\ - DST(op,i*16+15, (w2 >> 52) , parm);;\ -} - -#define BITUNPACK64_12(ip, op, parm) { \ - BITUNBLK64_12(ip, 0, op, parm);\ - BITUNBLK64_12(ip, 1, op, parm); DSTI(op); ip += 12*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_13(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fff, parm);\ - DST(op,i*64+ 1, (w0 >> 13) & 0x1fff, parm);\ - DST(op,i*64+ 2, (w0 >> 26) & 0x1fff, parm);\ - DST(op,i*64+ 3, (w0 >> 39) & 0x1fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*64+ 4, (w0 >> 52) | (w1 << 12) & 0x1fff, parm);\ - DST(op,i*64+ 5, (w1 >> 1) & 0x1fff, parm);\ - DST(op,i*64+ 6, (w1 >> 14) & 0x1fff, parm);\ - DST(op,i*64+ 7, (w1 >> 27) & 0x1fff, parm);\ - DST(op,i*64+ 8, (w1 >> 40) & 0x1fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w1 >> 53) | (w2 << 11) & 0x1fff, parm);\ - DST(op,i*64+10, (w2 >> 2) & 0x1fff, parm);\ - DST(op,i*64+11, (w2 >> 15) & 0x1fff, parm);\ - DST(op,i*64+12, (w2 >> 28) & 0x1fff, parm);\ - DST(op,i*64+13, (w2 >> 41) & 0x1fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w2 >> 54) | (w3 << 10) & 0x1fff, parm);\ - DST(op,i*64+15, (w3 >> 3) & 0x1fff, parm);\ - DST(op,i*64+16, (w3 >> 16) & 0x1fff, parm);\ - DST(op,i*64+17, (w3 >> 29) & 0x1fff, parm);\ - DST(op,i*64+18, (w3 >> 42) & 0x1fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w3 >> 55) | (w4 << 9) & 0x1fff, parm);\ - DST(op,i*64+20, (w4 >> 4) & 0x1fff, parm);\ - DST(op,i*64+21, (w4 >> 17) & 0x1fff, parm);\ - DST(op,i*64+22, (w4 >> 30) & 0x1fff, parm);\ - DST(op,i*64+23, (w4 >> 43) & 0x1fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w4 >> 56) | (w5 << 8) & 0x1fff, parm);\ - DST(op,i*64+25, (w5 >> 5) & 0x1fff, parm);\ - DST(op,i*64+26, (w5 >> 18) & 0x1fff, parm);\ - DST(op,i*64+27, (w5 >> 31) & 0x1fff, parm);\ - DST(op,i*64+28, (w5 >> 44) & 0x1fff, parm); register uint64_t w6 = *(uint32_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w5 >> 57) | (w6 << 7) & 0x1fff, parm);\ - DST(op,i*64+30, (w6 >> 6) & 0x1fff, parm);\ - DST(op,i*64+31, (w6 >> 19) & 0x1fff, parm);;\ -} - -#define BITUNPACK64_13(ip, op, parm) { \ - BITUNBLK64_13(ip, 0, op, parm); DSTI(op); ip += 13*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_14(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fff, 
parm);\ - DST(op,i*32+ 1, (w0 >> 14) & 0x3fff, parm);\ - DST(op,i*32+ 2, (w0 >> 28) & 0x3fff, parm);\ - DST(op,i*32+ 3, (w0 >> 42) & 0x3fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w0 >> 56) | (w1 << 8) & 0x3fff, parm);\ - DST(op,i*32+ 5, (w1 >> 6) & 0x3fff, parm);\ - DST(op,i*32+ 6, (w1 >> 20) & 0x3fff, parm);\ - DST(op,i*32+ 7, (w1 >> 34) & 0x3fff, parm);\ - DST(op,i*32+ 8, (w1 >> 48) & 0x3fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w1 >> 62) | (w2 << 2) & 0x3fff, parm);\ - DST(op,i*32+10, (w2 >> 12) & 0x3fff, parm);\ - DST(op,i*32+11, (w2 >> 26) & 0x3fff, parm);\ - DST(op,i*32+12, (w2 >> 40) & 0x3fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w2 >> 54) | (w3 << 10) & 0x3fff, parm);\ - DST(op,i*32+14, (w3 >> 4) & 0x3fff, parm);\ - DST(op,i*32+15, (w3 >> 18) & 0x3fff, parm);\ - DST(op,i*32+16, (w3 >> 32) & 0x3fff, parm);\ - DST(op,i*32+17, (w3 >> 46) & 0x3fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w3 >> 60) | (w4 << 4) & 0x3fff, parm);\ - DST(op,i*32+19, (w4 >> 10) & 0x3fff, parm);\ - DST(op,i*32+20, (w4 >> 24) & 0x3fff, parm);\ - DST(op,i*32+21, (w4 >> 38) & 0x3fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w4 >> 52) | (w5 << 12) & 0x3fff, parm);\ - DST(op,i*32+23, (w5 >> 2) & 0x3fff, parm);\ - DST(op,i*32+24, (w5 >> 16) & 0x3fff, parm);\ - DST(op,i*32+25, (w5 >> 30) & 0x3fff, parm);\ - DST(op,i*32+26, (w5 >> 44) & 0x3fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w5 >> 58) | (w6 << 6) & 0x3fff, parm);\ - DST(op,i*32+28, (w6 >> 8) & 0x3fff, parm);\ - DST(op,i*32+29, (w6 >> 22) & 0x3fff, parm);\ - DST(op,i*32+30, (w6 >> 36) & 0x3fff, parm);\ - DST(op,i*32+31, (w6 >> 50) , parm);;\ -} - -#define BITUNPACK64_14(ip, op, parm) { \ - 
BITUNBLK64_14(ip, 0, op, parm); DSTI(op); ip += 14*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_15(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fff, parm);\ - DST(op,i*64+ 1, (w0 >> 15) & 0x7fff, parm);\ - DST(op,i*64+ 2, (w0 >> 30) & 0x7fff, parm);\ - DST(op,i*64+ 3, (w0 >> 45) & 0x7fff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w0 >> 60) | (w1 << 4) & 0x7fff, parm);\ - DST(op,i*64+ 5, (w1 >> 11) & 0x7fff, parm);\ - DST(op,i*64+ 6, (w1 >> 26) & 0x7fff, parm);\ - DST(op,i*64+ 7, (w1 >> 41) & 0x7fff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w1 >> 56) | (w2 << 8) & 0x7fff, parm);\ - DST(op,i*64+ 9, (w2 >> 7) & 0x7fff, parm);\ - DST(op,i*64+10, (w2 >> 22) & 0x7fff, parm);\ - DST(op,i*64+11, (w2 >> 37) & 0x7fff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w2 >> 52) | (w3 << 12) & 0x7fff, parm);\ - DST(op,i*64+13, (w3 >> 3) & 0x7fff, parm);\ - DST(op,i*64+14, (w3 >> 18) & 0x7fff, parm);\ - DST(op,i*64+15, (w3 >> 33) & 0x7fff, parm);\ - DST(op,i*64+16, (w3 >> 48) & 0x7fff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w3 >> 63) | (w4 << 1) & 0x7fff, parm);\ - DST(op,i*64+18, (w4 >> 14) & 0x7fff, parm);\ - DST(op,i*64+19, (w4 >> 29) & 0x7fff, parm);\ - DST(op,i*64+20, (w4 >> 44) & 0x7fff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w4 >> 59) | (w5 << 5) & 0x7fff, parm);\ - DST(op,i*64+22, (w5 >> 10) & 0x7fff, parm);\ - DST(op,i*64+23, (w5 >> 25) & 0x7fff, parm);\ - DST(op,i*64+24, (w5 >> 40) & 0x7fff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w5 >> 55) | (w6 << 9) & 0x7fff, parm);\ - DST(op,i*64+26, (w6 >> 6) & 0x7fff, parm);\ - DST(op,i*64+27, (w6 >> 21) & 0x7fff, parm);\ - 
DST(op,i*64+28, (w6 >> 36) & 0x7fff, parm); register uint64_t w7 = *(uint32_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w6 >> 51) | (w7 << 13) & 0x7fff, parm);\ - DST(op,i*64+30, (w7 >> 2) & 0x7fff, parm);\ - DST(op,i*64+31, (w7 >> 17) & 0x7fff, parm);;\ -} - -#define BITUNPACK64_15(ip, op, parm) { \ - BITUNBLK64_15(ip, 0, op, parm); DSTI(op); ip += 15*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_16(ip, i, op, parm) { \ - DST(op,i*4+ 0, *(uint16_t *)(ip+i*8+ 0), parm);\ - DST(op,i*4+ 1, *(uint16_t *)(ip+i*8+ 2), parm);\ - DST(op,i*4+ 2, *(uint16_t *)(ip+i*8+ 4), parm);\ - DST(op,i*4+ 3, *(uint16_t *)(ip+i*8+ 6), parm);;\ -} - -#define BITUNPACK64_16(ip, op, parm) { \ - BITUNBLK64_16(ip, 0, op, parm);\ - BITUNBLK64_16(ip, 1, op, parm);\ - BITUNBLK64_16(ip, 2, op, parm);\ - BITUNBLK64_16(ip, 3, op, parm);\ - BITUNBLK64_16(ip, 4, op, parm);\ - BITUNBLK64_16(ip, 5, op, parm);\ - BITUNBLK64_16(ip, 6, op, parm);\ - BITUNBLK64_16(ip, 7, op, parm); DSTI(op); ip += 16*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_17(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*17+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ffff, parm);\ - DST(op,i*64+ 1, (w0 >> 17) & 0x1ffff, parm);\ - DST(op,i*64+ 2, (w0 >> 34) & 0x1ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*17+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w0 >> 51) | (w1 << 13) & 0x1ffff, parm);\ - DST(op,i*64+ 4, (w1 >> 4) & 0x1ffff, parm);\ - DST(op,i*64+ 5, (w1 >> 21) & 0x1ffff, parm);\ - DST(op,i*64+ 6, (w1 >> 38) & 0x1ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*17+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w1 >> 55) | (w2 << 9) & 0x1ffff, parm);\ - DST(op,i*64+ 8, (w2 >> 8) & 0x1ffff, parm);\ - DST(op,i*64+ 9, (w2 >> 25) & 0x1ffff, parm);\ - DST(op,i*64+10, (w2 >> 42) & 0x1ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*17+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w2 >> 59) | (w3 << 5) & 0x1ffff, parm);\ - DST(op,i*64+12, (w3 >> 12) & 0x1ffff, parm);\ - 
DST(op,i*64+13, (w3 >> 29) & 0x1ffff, parm);\ - DST(op,i*64+14, (w3 >> 46) & 0x1ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*17+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w3 >> 63) | (w4 << 1) & 0x1ffff, parm);\ - DST(op,i*64+16, (w4 >> 16) & 0x1ffff, parm);\ - DST(op,i*64+17, (w4 >> 33) & 0x1ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*17+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w4 >> 50) | (w5 << 14) & 0x1ffff, parm);\ - DST(op,i*64+19, (w5 >> 3) & 0x1ffff, parm);\ - DST(op,i*64+20, (w5 >> 20) & 0x1ffff, parm);\ - DST(op,i*64+21, (w5 >> 37) & 0x1ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*17+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w5 >> 54) | (w6 << 10) & 0x1ffff, parm);\ - DST(op,i*64+23, (w6 >> 7) & 0x1ffff, parm);\ - DST(op,i*64+24, (w6 >> 24) & 0x1ffff, parm);\ - DST(op,i*64+25, (w6 >> 41) & 0x1ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*17+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w6 >> 58) | (w7 << 6) & 0x1ffff, parm);\ - DST(op,i*64+27, (w7 >> 11) & 0x1ffff, parm);\ - DST(op,i*64+28, (w7 >> 28) & 0x1ffff, parm);\ - DST(op,i*64+29, (w7 >> 45) & 0x1ffff, parm); register uint64_t w8 = *(uint32_t *)(ip+(i*17+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w7 >> 62) | (w8 << 2) & 0x1ffff, parm);\ - DST(op,i*64+31, (w8 >> 15) & 0x1ffff, parm);;\ -} - -#define BITUNPACK64_17(ip, op, parm) { \ - BITUNBLK64_17(ip, 0, op, parm); DSTI(op); ip += 17*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_18(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffff, parm);\ - DST(op,i*32+ 1, (w0 >> 18) & 0x3ffff, parm);\ - DST(op,i*32+ 2, (w0 >> 36) & 0x3ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w0 >> 54) | (w1 << 10) & 0x3ffff, parm);\ - DST(op,i*32+ 4, (w1 >> 8) & 0x3ffff, parm);\ - DST(op,i*32+ 5, (w1 >> 26) & 0x3ffff, parm);\ - DST(op,i*32+ 6, (w1 >> 44) & 0x3ffff, parm); register uint64_t w2 = *(uint64_t 
*)(ip+(i*9+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w1 >> 62) | (w2 << 2) & 0x3ffff, parm);\ - DST(op,i*32+ 8, (w2 >> 16) & 0x3ffff, parm);\ - DST(op,i*32+ 9, (w2 >> 34) & 0x3ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w2 >> 52) | (w3 << 12) & 0x3ffff, parm);\ - DST(op,i*32+11, (w3 >> 6) & 0x3ffff, parm);\ - DST(op,i*32+12, (w3 >> 24) & 0x3ffff, parm);\ - DST(op,i*32+13, (w3 >> 42) & 0x3ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w3 >> 60) | (w4 << 4) & 0x3ffff, parm);\ - DST(op,i*32+15, (w4 >> 14) & 0x3ffff, parm);\ - DST(op,i*32+16, (w4 >> 32) & 0x3ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*9+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w4 >> 50) | (w5 << 14) & 0x3ffff, parm);\ - DST(op,i*32+18, (w5 >> 4) & 0x3ffff, parm);\ - DST(op,i*32+19, (w5 >> 22) & 0x3ffff, parm);\ - DST(op,i*32+20, (w5 >> 40) & 0x3ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*9+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w5 >> 58) | (w6 << 6) & 0x3ffff, parm);\ - DST(op,i*32+22, (w6 >> 12) & 0x3ffff, parm);\ - DST(op,i*32+23, (w6 >> 30) & 0x3ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*9+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w6 >> 48) | (w7 << 16) & 0x3ffff, parm);\ - DST(op,i*32+25, (w7 >> 2) & 0x3ffff, parm);\ - DST(op,i*32+26, (w7 >> 20) & 0x3ffff, parm);\ - DST(op,i*32+27, (w7 >> 38) & 0x3ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*9+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w7 >> 56) | (w8 << 8) & 0x3ffff, parm);\ - DST(op,i*32+29, (w8 >> 10) & 0x3ffff, parm);\ - DST(op,i*32+30, (w8 >> 28) & 0x3ffff, parm);\ - DST(op,i*32+31, (w8 >> 46) , parm);;\ -} - -#define BITUNPACK64_18(ip, op, parm) { \ - BITUNBLK64_18(ip, 0, op, parm); DSTI(op); ip += 18*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_19(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*19+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffff, 
parm);\ - DST(op,i*64+ 1, (w0 >> 19) & 0x7ffff, parm);\ - DST(op,i*64+ 2, (w0 >> 38) & 0x7ffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*19+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w0 >> 57) | (w1 << 7) & 0x7ffff, parm);\ - DST(op,i*64+ 4, (w1 >> 12) & 0x7ffff, parm);\ - DST(op,i*64+ 5, (w1 >> 31) & 0x7ffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*19+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w1 >> 50) | (w2 << 14) & 0x7ffff, parm);\ - DST(op,i*64+ 7, (w2 >> 5) & 0x7ffff, parm);\ - DST(op,i*64+ 8, (w2 >> 24) & 0x7ffff, parm);\ - DST(op,i*64+ 9, (w2 >> 43) & 0x7ffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*19+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w2 >> 62) | (w3 << 2) & 0x7ffff, parm);\ - DST(op,i*64+11, (w3 >> 17) & 0x7ffff, parm);\ - DST(op,i*64+12, (w3 >> 36) & 0x7ffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*19+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w3 >> 55) | (w4 << 9) & 0x7ffff, parm);\ - DST(op,i*64+14, (w4 >> 10) & 0x7ffff, parm);\ - DST(op,i*64+15, (w4 >> 29) & 0x7ffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*19+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w4 >> 48) | (w5 << 16) & 0x7ffff, parm);\ - DST(op,i*64+17, (w5 >> 3) & 0x7ffff, parm);\ - DST(op,i*64+18, (w5 >> 22) & 0x7ffff, parm);\ - DST(op,i*64+19, (w5 >> 41) & 0x7ffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*19+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w5 >> 60) | (w6 << 4) & 0x7ffff, parm);\ - DST(op,i*64+21, (w6 >> 15) & 0x7ffff, parm);\ - DST(op,i*64+22, (w6 >> 34) & 0x7ffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*19+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w6 >> 53) | (w7 << 11) & 0x7ffff, parm);\ - DST(op,i*64+24, (w7 >> 8) & 0x7ffff, parm);\ - DST(op,i*64+25, (w7 >> 27) & 0x7ffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*19+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w7 >> 46) | (w8 << 18) & 0x7ffff, parm);\ - DST(op,i*64+27, (w8 >> 1) & 0x7ffff, parm);\ - DST(op,i*64+28, (w8 >> 20) & 0x7ffff, 
parm);\ - DST(op,i*64+29, (w8 >> 39) & 0x7ffff, parm); register uint64_t w9 = *(uint32_t *)(ip+(i*19+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w8 >> 58) | (w9 << 6) & 0x7ffff, parm);\ - DST(op,i*64+31, (w9 >> 13) & 0x7ffff, parm);;\ -} - -#define BITUNPACK64_19(ip, op, parm) { \ - BITUNBLK64_19(ip, 0, op, parm); DSTI(op); ip += 19*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_20(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffff, parm);\ - DST(op,i*16+ 1, (w0 >> 20) & 0xfffff, parm);\ - DST(op,i*16+ 2, (w0 >> 40) & 0xfffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 3, (w0 >> 60) | (w1 << 4) & 0xfffff, parm);\ - DST(op,i*16+ 4, (w1 >> 16) & 0xfffff, parm);\ - DST(op,i*16+ 5, (w1 >> 36) & 0xfffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 6, (w1 >> 56) | (w2 << 8) & 0xfffff, parm);\ - DST(op,i*16+ 7, (w2 >> 12) & 0xfffff, parm);\ - DST(op,i*16+ 8, (w2 >> 32) & 0xfffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 9, (w2 >> 52) | (w3 << 12) & 0xfffff, parm);\ - DST(op,i*16+10, (w3 >> 8) & 0xfffff, parm);\ - DST(op,i*16+11, (w3 >> 28) & 0xfffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+12, (w3 >> 48) | (w4 << 16) & 0xfffff, parm);\ - DST(op,i*16+13, (w4 >> 4) & 0xfffff, parm);\ - DST(op,i*16+14, (w4 >> 24) & 0xfffff, parm);\ - DST(op,i*16+15, (w4 >> 44) , parm);;\ -} - -#define BITUNPACK64_20(ip, op, parm) { \ - BITUNBLK64_20(ip, 0, op, parm);\ - BITUNBLK64_20(ip, 1, op, parm); DSTI(op); ip += 20*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_21(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*21+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fffff, parm);\ - DST(op,i*64+ 1, (w0 >> 21) & 0x1fffff, parm);\ - DST(op,i*64+ 2, (w0 >> 42) & 0x1fffff, parm); register uint64_t w1 = *(uint64_t 
*)(ip+(i*21+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w0 >> 63) | (w1 << 1) & 0x1fffff, parm);\ - DST(op,i*64+ 4, (w1 >> 20) & 0x1fffff, parm);\ - DST(op,i*64+ 5, (w1 >> 41) & 0x1fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*21+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w1 >> 62) | (w2 << 2) & 0x1fffff, parm);\ - DST(op,i*64+ 7, (w2 >> 19) & 0x1fffff, parm);\ - DST(op,i*64+ 8, (w2 >> 40) & 0x1fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*21+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w2 >> 61) | (w3 << 3) & 0x1fffff, parm);\ - DST(op,i*64+10, (w3 >> 18) & 0x1fffff, parm);\ - DST(op,i*64+11, (w3 >> 39) & 0x1fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*21+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w3 >> 60) | (w4 << 4) & 0x1fffff, parm);\ - DST(op,i*64+13, (w4 >> 17) & 0x1fffff, parm);\ - DST(op,i*64+14, (w4 >> 38) & 0x1fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*21+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w4 >> 59) | (w5 << 5) & 0x1fffff, parm);\ - DST(op,i*64+16, (w5 >> 16) & 0x1fffff, parm);\ - DST(op,i*64+17, (w5 >> 37) & 0x1fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*21+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w5 >> 58) | (w6 << 6) & 0x1fffff, parm);\ - DST(op,i*64+19, (w6 >> 15) & 0x1fffff, parm);\ - DST(op,i*64+20, (w6 >> 36) & 0x1fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*21+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w6 >> 57) | (w7 << 7) & 0x1fffff, parm);\ - DST(op,i*64+22, (w7 >> 14) & 0x1fffff, parm);\ - DST(op,i*64+23, (w7 >> 35) & 0x1fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*21+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w7 >> 56) | (w8 << 8) & 0x1fffff, parm);\ - DST(op,i*64+25, (w8 >> 13) & 0x1fffff, parm);\ - DST(op,i*64+26, (w8 >> 34) & 0x1fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*21+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w8 >> 55) | (w9 << 9) & 0x1fffff, parm);\ - DST(op,i*64+28, (w9 >> 12) & 0x1fffff, parm);\ - DST(op,i*64+29, 
(w9 >> 33) & 0x1fffff, parm); register uint64_t w10 = *(uint32_t *)(ip+(i*21+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w9 >> 54) | (w10 << 10) & 0x1fffff, parm);\ - DST(op,i*64+31, (w10 >> 11) & 0x1fffff, parm);;\ -} - -#define BITUNPACK64_21(ip, op, parm) { \ - BITUNBLK64_21(ip, 0, op, parm); DSTI(op); ip += 21*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_22(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffff, parm);\ - DST(op,i*32+ 1, (w0 >> 22) & 0x3fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w0 >> 44) | (w1 << 20) & 0x3fffff, parm);\ - DST(op,i*32+ 3, (w1 >> 2) & 0x3fffff, parm);\ - DST(op,i*32+ 4, (w1 >> 24) & 0x3fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w1 >> 46) | (w2 << 18) & 0x3fffff, parm);\ - DST(op,i*32+ 6, (w2 >> 4) & 0x3fffff, parm);\ - DST(op,i*32+ 7, (w2 >> 26) & 0x3fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w2 >> 48) | (w3 << 16) & 0x3fffff, parm);\ - DST(op,i*32+ 9, (w3 >> 6) & 0x3fffff, parm);\ - DST(op,i*32+10, (w3 >> 28) & 0x3fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w3 >> 50) | (w4 << 14) & 0x3fffff, parm);\ - DST(op,i*32+12, (w4 >> 8) & 0x3fffff, parm);\ - DST(op,i*32+13, (w4 >> 30) & 0x3fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w4 >> 52) | (w5 << 12) & 0x3fffff, parm);\ - DST(op,i*32+15, (w5 >> 10) & 0x3fffff, parm);\ - DST(op,i*32+16, (w5 >> 32) & 0x3fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*11+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w5 >> 54) | (w6 << 10) & 0x3fffff, parm);\ - DST(op,i*32+18, (w6 >> 12) & 0x3fffff, parm);\ - DST(op,i*32+19, (w6 >> 34) & 0x3fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*11+7)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*32+20, (w6 >> 56) | (w7 << 8) & 0x3fffff, parm);\ - DST(op,i*32+21, (w7 >> 14) & 0x3fffff, parm);\ - DST(op,i*32+22, (w7 >> 36) & 0x3fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*11+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w7 >> 58) | (w8 << 6) & 0x3fffff, parm);\ - DST(op,i*32+24, (w8 >> 16) & 0x3fffff, parm);\ - DST(op,i*32+25, (w8 >> 38) & 0x3fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*11+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w8 >> 60) | (w9 << 4) & 0x3fffff, parm);\ - DST(op,i*32+27, (w9 >> 18) & 0x3fffff, parm);\ - DST(op,i*32+28, (w9 >> 40) & 0x3fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*11+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w9 >> 62) | (w10 << 2) & 0x3fffff, parm);\ - DST(op,i*32+30, (w10 >> 20) & 0x3fffff, parm);\ - DST(op,i*32+31, (w10 >> 42) , parm);;\ -} - -#define BITUNPACK64_22(ip, op, parm) { \ - BITUNBLK64_22(ip, 0, op, parm); DSTI(op); ip += 22*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_23(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*23+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffff, parm);\ - DST(op,i*64+ 1, (w0 >> 23) & 0x7fffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*23+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w0 >> 46) | (w1 << 18) & 0x7fffff, parm);\ - DST(op,i*64+ 3, (w1 >> 5) & 0x7fffff, parm);\ - DST(op,i*64+ 4, (w1 >> 28) & 0x7fffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*23+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w1 >> 51) | (w2 << 13) & 0x7fffff, parm);\ - DST(op,i*64+ 6, (w2 >> 10) & 0x7fffff, parm);\ - DST(op,i*64+ 7, (w2 >> 33) & 0x7fffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*23+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w2 >> 56) | (w3 << 8) & 0x7fffff, parm);\ - DST(op,i*64+ 9, (w3 >> 15) & 0x7fffff, parm);\ - DST(op,i*64+10, (w3 >> 38) & 0x7fffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*23+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w3 >> 61) | (w4 << 3) & 0x7fffff, parm);\ - 
DST(op,i*64+12, (w4 >> 20) & 0x7fffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*23+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w4 >> 43) | (w5 << 21) & 0x7fffff, parm);\ - DST(op,i*64+14, (w5 >> 2) & 0x7fffff, parm);\ - DST(op,i*64+15, (w5 >> 25) & 0x7fffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*23+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w5 >> 48) | (w6 << 16) & 0x7fffff, parm);\ - DST(op,i*64+17, (w6 >> 7) & 0x7fffff, parm);\ - DST(op,i*64+18, (w6 >> 30) & 0x7fffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*23+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w6 >> 53) | (w7 << 11) & 0x7fffff, parm);\ - DST(op,i*64+20, (w7 >> 12) & 0x7fffff, parm);\ - DST(op,i*64+21, (w7 >> 35) & 0x7fffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*23+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w7 >> 58) | (w8 << 6) & 0x7fffff, parm);\ - DST(op,i*64+23, (w8 >> 17) & 0x7fffff, parm);\ - DST(op,i*64+24, (w8 >> 40) & 0x7fffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*23+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w8 >> 63) | (w9 << 1) & 0x7fffff, parm);\ - DST(op,i*64+26, (w9 >> 22) & 0x7fffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*23+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w9 >> 45) | (w10 << 19) & 0x7fffff, parm);\ - DST(op,i*64+28, (w10 >> 4) & 0x7fffff, parm);\ - DST(op,i*64+29, (w10 >> 27) & 0x7fffff, parm); register uint64_t w11 = *(uint32_t *)(ip+(i*23+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w10 >> 50) | (w11 << 14) & 0x7fffff, parm);\ - DST(op,i*64+31, (w11 >> 9) & 0x7fffff, parm);;\ -} - -#define BITUNPACK64_23(ip, op, parm) { \ - BITUNBLK64_23(ip, 0, op, parm); DSTI(op); ip += 23*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_24(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ - DST(op,i*8+ 0, (w0 ) & 0xffffff, parm);\ - DST(op,i*8+ 1, (w0 >> 24) & 0xffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 2, (w0 >> 48) | (w1 << 16) & 
0xffffff, parm);\ - DST(op,i*8+ 3, (w1 >> 8) & 0xffffff, parm);\ - DST(op,i*8+ 4, (w1 >> 32) & 0xffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 5, (w1 >> 56) | (w2 << 8) & 0xffffff, parm);\ - DST(op,i*8+ 6, (w2 >> 16) & 0xffffff, parm);\ - DST(op,i*8+ 7, (w2 >> 40) , parm);;\ -} - -#define BITUNPACK64_24(ip, op, parm) { \ - BITUNBLK64_24(ip, 0, op, parm);\ - BITUNBLK64_24(ip, 1, op, parm);\ - BITUNBLK64_24(ip, 2, op, parm);\ - BITUNBLK64_24(ip, 3, op, parm); DSTI(op); ip += 24*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_25(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*25+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ffffff, parm);\ - DST(op,i*64+ 1, (w0 >> 25) & 0x1ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*25+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w0 >> 50) | (w1 << 14) & 0x1ffffff, parm);\ - DST(op,i*64+ 3, (w1 >> 11) & 0x1ffffff, parm);\ - DST(op,i*64+ 4, (w1 >> 36) & 0x1ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*25+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w1 >> 61) | (w2 << 3) & 0x1ffffff, parm);\ - DST(op,i*64+ 6, (w2 >> 22) & 0x1ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*25+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w2 >> 47) | (w3 << 17) & 0x1ffffff, parm);\ - DST(op,i*64+ 8, (w3 >> 8) & 0x1ffffff, parm);\ - DST(op,i*64+ 9, (w3 >> 33) & 0x1ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*25+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w3 >> 58) | (w4 << 6) & 0x1ffffff, parm);\ - DST(op,i*64+11, (w4 >> 19) & 0x1ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*25+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w4 >> 44) | (w5 << 20) & 0x1ffffff, parm);\ - DST(op,i*64+13, (w5 >> 5) & 0x1ffffff, parm);\ - DST(op,i*64+14, (w5 >> 30) & 0x1ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*25+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w5 >> 55) | (w6 << 9) & 0x1ffffff, parm);\ - DST(op,i*64+16, (w6 >> 16) & 0x1ffffff, 
parm); register uint64_t w7 = *(uint64_t *)(ip+(i*25+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w6 >> 41) | (w7 << 23) & 0x1ffffff, parm);\ - DST(op,i*64+18, (w7 >> 2) & 0x1ffffff, parm);\ - DST(op,i*64+19, (w7 >> 27) & 0x1ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*25+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w7 >> 52) | (w8 << 12) & 0x1ffffff, parm);\ - DST(op,i*64+21, (w8 >> 13) & 0x1ffffff, parm);\ - DST(op,i*64+22, (w8 >> 38) & 0x1ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*25+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w8 >> 63) | (w9 << 1) & 0x1ffffff, parm);\ - DST(op,i*64+24, (w9 >> 24) & 0x1ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*25+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w9 >> 49) | (w10 << 15) & 0x1ffffff, parm);\ - DST(op,i*64+26, (w10 >> 10) & 0x1ffffff, parm);\ - DST(op,i*64+27, (w10 >> 35) & 0x1ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*25+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w10 >> 60) | (w11 << 4) & 0x1ffffff, parm);\ - DST(op,i*64+29, (w11 >> 21) & 0x1ffffff, parm); register uint64_t w12 = *(uint32_t *)(ip+(i*25+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w11 >> 46) | (w12 << 18) & 0x1ffffff, parm);\ - DST(op,i*64+31, (w12 >> 7) & 0x1ffffff, parm);;\ -} - -#define BITUNPACK64_25(ip, op, parm) { \ - BITUNBLK64_25(ip, 0, op, parm); DSTI(op); ip += 25*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_26(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffffff, parm);\ - DST(op,i*32+ 1, (w0 >> 26) & 0x3ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w0 >> 52) | (w1 << 12) & 0x3ffffff, parm);\ - DST(op,i*32+ 3, (w1 >> 14) & 0x3ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w1 >> 40) | (w2 << 24) & 0x3ffffff, parm);\ - DST(op,i*32+ 5, (w2 >> 2) & 0x3ffffff, parm);\ - DST(op,i*32+ 6, (w2 >> 28) & 
0x3ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w2 >> 54) | (w3 << 10) & 0x3ffffff, parm);\ - DST(op,i*32+ 8, (w3 >> 16) & 0x3ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w3 >> 42) | (w4 << 22) & 0x3ffffff, parm);\ - DST(op,i*32+10, (w4 >> 4) & 0x3ffffff, parm);\ - DST(op,i*32+11, (w4 >> 30) & 0x3ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w4 >> 56) | (w5 << 8) & 0x3ffffff, parm);\ - DST(op,i*32+13, (w5 >> 18) & 0x3ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w5 >> 44) | (w6 << 20) & 0x3ffffff, parm);\ - DST(op,i*32+15, (w6 >> 6) & 0x3ffffff, parm);\ - DST(op,i*32+16, (w6 >> 32) & 0x3ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*13+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w6 >> 58) | (w7 << 6) & 0x3ffffff, parm);\ - DST(op,i*32+18, (w7 >> 20) & 0x3ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*13+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w7 >> 46) | (w8 << 18) & 0x3ffffff, parm);\ - DST(op,i*32+20, (w8 >> 8) & 0x3ffffff, parm);\ - DST(op,i*32+21, (w8 >> 34) & 0x3ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*13+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w8 >> 60) | (w9 << 4) & 0x3ffffff, parm);\ - DST(op,i*32+23, (w9 >> 22) & 0x3ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*13+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w9 >> 48) | (w10 << 16) & 0x3ffffff, parm);\ - DST(op,i*32+25, (w10 >> 10) & 0x3ffffff, parm);\ - DST(op,i*32+26, (w10 >> 36) & 0x3ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*13+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w10 >> 62) | (w11 << 2) & 0x3ffffff, parm);\ - DST(op,i*32+28, (w11 >> 24) & 0x3ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*13+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w11 >> 50) | (w12 << 14) & 0x3ffffff, 
parm);\ - DST(op,i*32+30, (w12 >> 12) & 0x3ffffff, parm);\ - DST(op,i*32+31, (w12 >> 38) , parm);;\ -} - -#define BITUNPACK64_26(ip, op, parm) { \ - BITUNBLK64_26(ip, 0, op, parm); DSTI(op); ip += 26*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_27(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*27+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffffff, parm);\ - DST(op,i*64+ 1, (w0 >> 27) & 0x7ffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*27+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w0 >> 54) | (w1 << 10) & 0x7ffffff, parm);\ - DST(op,i*64+ 3, (w1 >> 17) & 0x7ffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*27+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w1 >> 44) | (w2 << 20) & 0x7ffffff, parm);\ - DST(op,i*64+ 5, (w2 >> 7) & 0x7ffffff, parm);\ - DST(op,i*64+ 6, (w2 >> 34) & 0x7ffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*27+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w2 >> 61) | (w3 << 3) & 0x7ffffff, parm);\ - DST(op,i*64+ 8, (w3 >> 24) & 0x7ffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*27+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w3 >> 51) | (w4 << 13) & 0x7ffffff, parm);\ - DST(op,i*64+10, (w4 >> 14) & 0x7ffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*27+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w4 >> 41) | (w5 << 23) & 0x7ffffff, parm);\ - DST(op,i*64+12, (w5 >> 4) & 0x7ffffff, parm);\ - DST(op,i*64+13, (w5 >> 31) & 0x7ffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*27+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w5 >> 58) | (w6 << 6) & 0x7ffffff, parm);\ - DST(op,i*64+15, (w6 >> 21) & 0x7ffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*27+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w6 >> 48) | (w7 << 16) & 0x7ffffff, parm);\ - DST(op,i*64+17, (w7 >> 11) & 0x7ffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*27+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w7 >> 38) | (w8 << 26) & 0x7ffffff, parm);\ - DST(op,i*64+19, (w8 >> 1) & 0x7ffffff, parm);\ - 
DST(op,i*64+20, (w8 >> 28) & 0x7ffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*27+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w8 >> 55) | (w9 << 9) & 0x7ffffff, parm);\ - DST(op,i*64+22, (w9 >> 18) & 0x7ffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*27+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w9 >> 45) | (w10 << 19) & 0x7ffffff, parm);\ - DST(op,i*64+24, (w10 >> 8) & 0x7ffffff, parm);\ - DST(op,i*64+25, (w10 >> 35) & 0x7ffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*27+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w10 >> 62) | (w11 << 2) & 0x7ffffff, parm);\ - DST(op,i*64+27, (w11 >> 25) & 0x7ffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*27+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w11 >> 52) | (w12 << 12) & 0x7ffffff, parm);\ - DST(op,i*64+29, (w12 >> 15) & 0x7ffffff, parm); register uint64_t w13 = *(uint32_t *)(ip+(i*27+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w12 >> 42) | (w13 << 22) & 0x7ffffff, parm);\ - DST(op,i*64+31, (w13 >> 5) & 0x7ffffff, parm);;\ -} - -#define BITUNPACK64_27(ip, op, parm) { \ - BITUNBLK64_27(ip, 0, op, parm); DSTI(op); ip += 27*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_28(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffffff, parm);\ - DST(op,i*16+ 1, (w0 >> 28) & 0xfffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 2, (w0 >> 56) | (w1 << 8) & 0xfffffff, parm);\ - DST(op,i*16+ 3, (w1 >> 20) & 0xfffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 4, (w1 >> 48) | (w2 << 16) & 0xfffffff, parm);\ - DST(op,i*16+ 5, (w2 >> 12) & 0xfffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 6, (w2 >> 40) | (w3 << 24) & 0xfffffff, parm);\ - DST(op,i*16+ 7, (w3 >> 4) & 0xfffffff, parm);\ - DST(op,i*16+ 8, (w3 >> 32) & 0xfffffff, parm); register uint64_t w4 = *(uint64_t 
*)(ip+(i*7+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 9, (w3 >> 60) | (w4 << 4) & 0xfffffff, parm);\ - DST(op,i*16+10, (w4 >> 24) & 0xfffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+11, (w4 >> 52) | (w5 << 12) & 0xfffffff, parm);\ - DST(op,i*16+12, (w5 >> 16) & 0xfffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+13, (w5 >> 44) | (w6 << 20) & 0xfffffff, parm);\ - DST(op,i*16+14, (w6 >> 8) & 0xfffffff, parm);\ - DST(op,i*16+15, (w6 >> 36) , parm);;\ -} - -#define BITUNPACK64_28(ip, op, parm) { \ - BITUNBLK64_28(ip, 0, op, parm);\ - BITUNBLK64_28(ip, 1, op, parm); DSTI(op); ip += 28*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_29(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*29+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fffffff, parm);\ - DST(op,i*64+ 1, (w0 >> 29) & 0x1fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*29+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w0 >> 58) | (w1 << 6) & 0x1fffffff, parm);\ - DST(op,i*64+ 3, (w1 >> 23) & 0x1fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*29+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w1 >> 52) | (w2 << 12) & 0x1fffffff, parm);\ - DST(op,i*64+ 5, (w2 >> 17) & 0x1fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*29+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w2 >> 46) | (w3 << 18) & 0x1fffffff, parm);\ - DST(op,i*64+ 7, (w3 >> 11) & 0x1fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*29+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w3 >> 40) | (w4 << 24) & 0x1fffffff, parm);\ - DST(op,i*64+ 9, (w4 >> 5) & 0x1fffffff, parm);\ - DST(op,i*64+10, (w4 >> 34) & 0x1fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*29+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w4 >> 63) | (w5 << 1) & 0x1fffffff, parm);\ - DST(op,i*64+12, (w5 >> 28) & 0x1fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*29+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w5 >> 57) | (w6 << 7) 
& 0x1fffffff, parm);\ - DST(op,i*64+14, (w6 >> 22) & 0x1fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*29+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w6 >> 51) | (w7 << 13) & 0x1fffffff, parm);\ - DST(op,i*64+16, (w7 >> 16) & 0x1fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*29+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w7 >> 45) | (w8 << 19) & 0x1fffffff, parm);\ - DST(op,i*64+18, (w8 >> 10) & 0x1fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*29+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w8 >> 39) | (w9 << 25) & 0x1fffffff, parm);\ - DST(op,i*64+20, (w9 >> 4) & 0x1fffffff, parm);\ - DST(op,i*64+21, (w9 >> 33) & 0x1fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*29+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w9 >> 62) | (w10 << 2) & 0x1fffffff, parm);\ - DST(op,i*64+23, (w10 >> 27) & 0x1fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*29+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w10 >> 56) | (w11 << 8) & 0x1fffffff, parm);\ - DST(op,i*64+25, (w11 >> 21) & 0x1fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*29+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w11 >> 50) | (w12 << 14) & 0x1fffffff, parm);\ - DST(op,i*64+27, (w12 >> 15) & 0x1fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*29+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w12 >> 44) | (w13 << 20) & 0x1fffffff, parm);\ - DST(op,i*64+29, (w13 >> 9) & 0x1fffffff, parm); register uint64_t w14 = *(uint32_t *)(ip+(i*29+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w13 >> 38) | (w14 << 26) & 0x1fffffff, parm);\ - DST(op,i*64+31, (w14 >> 3) & 0x1fffffff, parm);;\ -} - -#define BITUNPACK64_29(ip, op, parm) { \ - BITUNBLK64_29(ip, 0, op, parm); DSTI(op); ip += 29*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_30(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffffff, parm);\ - DST(op,i*32+ 1, (w0 >> 30) & 0x3fffffff, parm); register uint64_t w1 = *(uint64_t 
*)(ip+(i*15+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w0 >> 60) | (w1 << 4) & 0x3fffffff, parm);\ - DST(op,i*32+ 3, (w1 >> 26) & 0x3fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w1 >> 56) | (w2 << 8) & 0x3fffffff, parm);\ - DST(op,i*32+ 5, (w2 >> 22) & 0x3fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w2 >> 52) | (w3 << 12) & 0x3fffffff, parm);\ - DST(op,i*32+ 7, (w3 >> 18) & 0x3fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w3 >> 48) | (w4 << 16) & 0x3fffffff, parm);\ - DST(op,i*32+ 9, (w4 >> 14) & 0x3fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w4 >> 44) | (w5 << 20) & 0x3fffffff, parm);\ - DST(op,i*32+11, (w5 >> 10) & 0x3fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w5 >> 40) | (w6 << 24) & 0x3fffffff, parm);\ - DST(op,i*32+13, (w6 >> 6) & 0x3fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w6 >> 36) | (w7 << 28) & 0x3fffffff, parm);\ - DST(op,i*32+15, (w7 >> 2) & 0x3fffffff, parm);\ - DST(op,i*32+16, (w7 >> 32) & 0x3fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*15+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w7 >> 62) | (w8 << 2) & 0x3fffffff, parm);\ - DST(op,i*32+18, (w8 >> 28) & 0x3fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*15+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w8 >> 58) | (w9 << 6) & 0x3fffffff, parm);\ - DST(op,i*32+20, (w9 >> 24) & 0x3fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*15+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w9 >> 54) | (w10 << 10) & 0x3fffffff, parm);\ - DST(op,i*32+22, (w10 >> 20) & 0x3fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*15+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w10 >> 50) | (w11 << 14) & 0x3fffffff, 
parm);\ - DST(op,i*32+24, (w11 >> 16) & 0x3fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*15+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w11 >> 46) | (w12 << 18) & 0x3fffffff, parm);\ - DST(op,i*32+26, (w12 >> 12) & 0x3fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*15+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w12 >> 42) | (w13 << 22) & 0x3fffffff, parm);\ - DST(op,i*32+28, (w13 >> 8) & 0x3fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*15+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w13 >> 38) | (w14 << 26) & 0x3fffffff, parm);\ - DST(op,i*32+30, (w14 >> 4) & 0x3fffffff, parm);\ - DST(op,i*32+31, (w14 >> 34) , parm);;\ -} - -#define BITUNPACK64_30(ip, op, parm) { \ - BITUNBLK64_30(ip, 0, op, parm); DSTI(op); ip += 30*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_31(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*31+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffffff, parm);\ - DST(op,i*64+ 1, (w0 >> 31) & 0x7fffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*31+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w0 >> 62) | (w1 << 2) & 0x7fffffff, parm);\ - DST(op,i*64+ 3, (w1 >> 29) & 0x7fffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*31+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w1 >> 60) | (w2 << 4) & 0x7fffffff, parm);\ - DST(op,i*64+ 5, (w2 >> 27) & 0x7fffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*31+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w2 >> 58) | (w3 << 6) & 0x7fffffff, parm);\ - DST(op,i*64+ 7, (w3 >> 25) & 0x7fffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*31+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w3 >> 56) | (w4 << 8) & 0x7fffffff, parm);\ - DST(op,i*64+ 9, (w4 >> 23) & 0x7fffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*31+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w4 >> 54) | (w5 << 10) & 0x7fffffff, parm);\ - DST(op,i*64+11, (w5 >> 21) & 0x7fffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*31+6)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*64+12, (w5 >> 52) | (w6 << 12) & 0x7fffffff, parm);\ - DST(op,i*64+13, (w6 >> 19) & 0x7fffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*31+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w6 >> 50) | (w7 << 14) & 0x7fffffff, parm);\ - DST(op,i*64+15, (w7 >> 17) & 0x7fffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*31+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w7 >> 48) | (w8 << 16) & 0x7fffffff, parm);\ - DST(op,i*64+17, (w8 >> 15) & 0x7fffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*31+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w8 >> 46) | (w9 << 18) & 0x7fffffff, parm);\ - DST(op,i*64+19, (w9 >> 13) & 0x7fffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*31+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w9 >> 44) | (w10 << 20) & 0x7fffffff, parm);\ - DST(op,i*64+21, (w10 >> 11) & 0x7fffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*31+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w10 >> 42) | (w11 << 22) & 0x7fffffff, parm);\ - DST(op,i*64+23, (w11 >> 9) & 0x7fffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*31+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w11 >> 40) | (w12 << 24) & 0x7fffffff, parm);\ - DST(op,i*64+25, (w12 >> 7) & 0x7fffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*31+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w12 >> 38) | (w13 << 26) & 0x7fffffff, parm);\ - DST(op,i*64+27, (w13 >> 5) & 0x7fffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*31+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w13 >> 36) | (w14 << 28) & 0x7fffffff, parm);\ - DST(op,i*64+29, (w14 >> 3) & 0x7fffffff, parm); register uint64_t w15 = *(uint32_t *)(ip+(i*31+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w14 >> 34) | (w15 << 30) & 0x7fffffff, parm);\ - DST(op,i*64+31, (w15 >> 1) & 0x7fffffff, parm);;\ -} - -#define BITUNPACK64_31(ip, op, parm) { \ - BITUNBLK64_31(ip, 0, op, parm); DSTI(op); ip += 31*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_32(ip, i, op, parm) { \ - DST(op,i*2+ 0, *(uint32_t 
*)(ip+i*8+ 0), parm);\ - DST(op,i*2+ 1, *(uint32_t *)(ip+i*8+ 4), parm);;\ -} - -#define BITUNPACK64_32(ip, op, parm) { \ - BITUNBLK64_32(ip, 0, op, parm);\ - BITUNBLK64_32(ip, 1, op, parm);\ - BITUNBLK64_32(ip, 2, op, parm);\ - BITUNBLK64_32(ip, 3, op, parm);\ - BITUNBLK64_32(ip, 4, op, parm);\ - BITUNBLK64_32(ip, 5, op, parm);\ - BITUNBLK64_32(ip, 6, op, parm);\ - BITUNBLK64_32(ip, 7, op, parm);\ - BITUNBLK64_32(ip, 8, op, parm);\ - BITUNBLK64_32(ip, 9, op, parm);\ - BITUNBLK64_32(ip, 10, op, parm);\ - BITUNBLK64_32(ip, 11, op, parm);\ - BITUNBLK64_32(ip, 12, op, parm);\ - BITUNBLK64_32(ip, 13, op, parm);\ - BITUNBLK64_32(ip, 14, op, parm);\ - BITUNBLK64_32(ip, 15, op, parm); DSTI(op); ip += 32*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_33(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*33+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*33+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 33) | (w1 << 31) & 0x1ffffffff, parm);\ - DST(op,i*64+ 2, (w1 >> 2) & 0x1ffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*33+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w1 >> 35) | (w2 << 29) & 0x1ffffffff, parm);\ - DST(op,i*64+ 4, (w2 >> 4) & 0x1ffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*33+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w2 >> 37) | (w3 << 27) & 0x1ffffffff, parm);\ - DST(op,i*64+ 6, (w3 >> 6) & 0x1ffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*33+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w3 >> 39) | (w4 << 25) & 0x1ffffffff, parm);\ - DST(op,i*64+ 8, (w4 >> 8) & 0x1ffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*33+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w4 >> 41) | (w5 << 23) & 0x1ffffffff, parm);\ - DST(op,i*64+10, (w5 >> 10) & 0x1ffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*33+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w5 >> 43) | (w6 << 21) & 0x1ffffffff, parm);\ - DST(op,i*64+12, (w6 >> 12) & 0x1ffffffff, 
parm); register uint64_t w7 = *(uint64_t *)(ip+(i*33+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w6 >> 45) | (w7 << 19) & 0x1ffffffff, parm);\ - DST(op,i*64+14, (w7 >> 14) & 0x1ffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*33+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w7 >> 47) | (w8 << 17) & 0x1ffffffff, parm);\ - DST(op,i*64+16, (w8 >> 16) & 0x1ffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*33+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w8 >> 49) | (w9 << 15) & 0x1ffffffff, parm);\ - DST(op,i*64+18, (w9 >> 18) & 0x1ffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*33+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w9 >> 51) | (w10 << 13) & 0x1ffffffff, parm);\ - DST(op,i*64+20, (w10 >> 20) & 0x1ffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*33+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w10 >> 53) | (w11 << 11) & 0x1ffffffff, parm);\ - DST(op,i*64+22, (w11 >> 22) & 0x1ffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*33+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w11 >> 55) | (w12 << 9) & 0x1ffffffff, parm);\ - DST(op,i*64+24, (w12 >> 24) & 0x1ffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*33+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w12 >> 57) | (w13 << 7) & 0x1ffffffff, parm);\ - DST(op,i*64+26, (w13 >> 26) & 0x1ffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*33+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w13 >> 59) | (w14 << 5) & 0x1ffffffff, parm);\ - DST(op,i*64+28, (w14 >> 28) & 0x1ffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*33+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w14 >> 61) | (w15 << 3) & 0x1ffffffff, parm);\ - DST(op,i*64+30, (w15 >> 30) & 0x1ffffffff, parm); register uint64_t w16 = *(uint32_t *)(ip+(i*33+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w15 >> 63) | (w16 << 1) & 0x1ffffffff, parm);;\ -} - -#define BITUNPACK64_33(ip, op, parm) { \ - BITUNBLK64_33(ip, 0, op, parm); DSTI(op); ip += 33*4/sizeof(ip[0]);\ -} - -#define 
BITUNBLK64_34(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*17+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*17+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 34) | (w1 << 30) & 0x3ffffffff, parm);\ - DST(op,i*32+ 2, (w1 >> 4) & 0x3ffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*17+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w1 >> 38) | (w2 << 26) & 0x3ffffffff, parm);\ - DST(op,i*32+ 4, (w2 >> 8) & 0x3ffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*17+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w2 >> 42) | (w3 << 22) & 0x3ffffffff, parm);\ - DST(op,i*32+ 6, (w3 >> 12) & 0x3ffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*17+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w3 >> 46) | (w4 << 18) & 0x3ffffffff, parm);\ - DST(op,i*32+ 8, (w4 >> 16) & 0x3ffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*17+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w4 >> 50) | (w5 << 14) & 0x3ffffffff, parm);\ - DST(op,i*32+10, (w5 >> 20) & 0x3ffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*17+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w5 >> 54) | (w6 << 10) & 0x3ffffffff, parm);\ - DST(op,i*32+12, (w6 >> 24) & 0x3ffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*17+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w6 >> 58) | (w7 << 6) & 0x3ffffffff, parm);\ - DST(op,i*32+14, (w7 >> 28) & 0x3ffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*17+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w7 >> 62) | (w8 << 2) & 0x3ffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*17+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w8 >> 32) | (w9 << 32) & 0x3ffffffff, parm);\ - DST(op,i*32+17, (w9 >> 2) & 0x3ffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*17+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w9 >> 36) | (w10 << 28) & 0x3ffffffff, parm);\ - DST(op,i*32+19, (w10 >> 6) & 0x3ffffffff, parm); register uint64_t w11 = *(uint64_t 
*)(ip+(i*17+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w10 >> 40) | (w11 << 24) & 0x3ffffffff, parm);\ - DST(op,i*32+21, (w11 >> 10) & 0x3ffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*17+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w11 >> 44) | (w12 << 20) & 0x3ffffffff, parm);\ - DST(op,i*32+23, (w12 >> 14) & 0x3ffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*17+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w12 >> 48) | (w13 << 16) & 0x3ffffffff, parm);\ - DST(op,i*32+25, (w13 >> 18) & 0x3ffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*17+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w13 >> 52) | (w14 << 12) & 0x3ffffffff, parm);\ - DST(op,i*32+27, (w14 >> 22) & 0x3ffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*17+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w14 >> 56) | (w15 << 8) & 0x3ffffffff, parm);\ - DST(op,i*32+29, (w15 >> 26) & 0x3ffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*17+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w15 >> 60) | (w16 << 4) & 0x3ffffffff, parm);\ - DST(op,i*32+31, (w16 >> 30) , parm);;\ -} - -#define BITUNPACK64_34(ip, op, parm) { \ - BITUNBLK64_34(ip, 0, op, parm); DSTI(op); ip += 34*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_35(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*35+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*35+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 35) | (w1 << 29) & 0x7ffffffff, parm);\ - DST(op,i*64+ 2, (w1 >> 6) & 0x7ffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*35+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w1 >> 41) | (w2 << 23) & 0x7ffffffff, parm);\ - DST(op,i*64+ 4, (w2 >> 12) & 0x7ffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*35+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w2 >> 47) | (w3 << 17) & 0x7ffffffff, parm);\ - DST(op,i*64+ 6, (w3 >> 18) & 0x7ffffffff, parm); register uint64_t w4 = *(uint64_t 
*)(ip+(i*35+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w3 >> 53) | (w4 << 11) & 0x7ffffffff, parm);\ - DST(op,i*64+ 8, (w4 >> 24) & 0x7ffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*35+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w4 >> 59) | (w5 << 5) & 0x7ffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*35+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w5 >> 30) | (w6 << 34) & 0x7ffffffff, parm);\ - DST(op,i*64+11, (w6 >> 1) & 0x7ffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*35+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w6 >> 36) | (w7 << 28) & 0x7ffffffff, parm);\ - DST(op,i*64+13, (w7 >> 7) & 0x7ffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*35+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w7 >> 42) | (w8 << 22) & 0x7ffffffff, parm);\ - DST(op,i*64+15, (w8 >> 13) & 0x7ffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*35+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w8 >> 48) | (w9 << 16) & 0x7ffffffff, parm);\ - DST(op,i*64+17, (w9 >> 19) & 0x7ffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*35+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w9 >> 54) | (w10 << 10) & 0x7ffffffff, parm);\ - DST(op,i*64+19, (w10 >> 25) & 0x7ffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*35+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w10 >> 60) | (w11 << 4) & 0x7ffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*35+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w11 >> 31) | (w12 << 33) & 0x7ffffffff, parm);\ - DST(op,i*64+22, (w12 >> 2) & 0x7ffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*35+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w12 >> 37) | (w13 << 27) & 0x7ffffffff, parm);\ - DST(op,i*64+24, (w13 >> 8) & 0x7ffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*35+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w13 >> 43) | (w14 << 21) & 0x7ffffffff, parm);\ - DST(op,i*64+26, (w14 >> 14) & 0x7ffffffff, parm); register uint64_t w15 = *(uint64_t 
*)(ip+(i*35+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w14 >> 49) | (w15 << 15) & 0x7ffffffff, parm);\ - DST(op,i*64+28, (w15 >> 20) & 0x7ffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*35+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w15 >> 55) | (w16 << 9) & 0x7ffffffff, parm);\ - DST(op,i*64+30, (w16 >> 26) & 0x7ffffffff, parm); register uint64_t w17 = *(uint32_t *)(ip+(i*35+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w16 >> 61) | (w17 << 3) & 0x7ffffffff, parm);;\ -} - -#define BITUNPACK64_35(ip, op, parm) { \ - BITUNBLK64_35(ip, 0, op, parm); DSTI(op); ip += 35*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_36(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*9+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*9+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 1, (w0 >> 36) | (w1 << 28) & 0xfffffffff, parm);\ - DST(op,i*16+ 2, (w1 >> 8) & 0xfffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*9+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 3, (w1 >> 44) | (w2 << 20) & 0xfffffffff, parm);\ - DST(op,i*16+ 4, (w2 >> 16) & 0xfffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*9+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 5, (w2 >> 52) | (w3 << 12) & 0xfffffffff, parm);\ - DST(op,i*16+ 6, (w3 >> 24) & 0xfffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*9+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 7, (w3 >> 60) | (w4 << 4) & 0xfffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*9+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 8, (w4 >> 32) | (w5 << 32) & 0xfffffffff, parm);\ - DST(op,i*16+ 9, (w5 >> 4) & 0xfffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*9+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+10, (w5 >> 40) | (w6 << 24) & 0xfffffffff, parm);\ - DST(op,i*16+11, (w6 >> 12) & 0xfffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*9+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+12, (w6 >> 48) | (w7 << 16) & 0xfffffffff, parm);\ - DST(op,i*16+13, (w7 >> 20) & 
0xfffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*9+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+14, (w7 >> 56) | (w8 << 8) & 0xfffffffff, parm);\ - DST(op,i*16+15, (w8 >> 28) , parm);;\ -} - -#define BITUNPACK64_36(ip, op, parm) { \ - BITUNBLK64_36(ip, 0, op, parm);\ - BITUNBLK64_36(ip, 1, op, parm); DSTI(op); ip += 36*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_37(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*37+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*37+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 37) | (w1 << 27) & 0x1fffffffff, parm);\ - DST(op,i*64+ 2, (w1 >> 10) & 0x1fffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*37+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w1 >> 47) | (w2 << 17) & 0x1fffffffff, parm);\ - DST(op,i*64+ 4, (w2 >> 20) & 0x1fffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*37+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w2 >> 57) | (w3 << 7) & 0x1fffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*37+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w3 >> 30) | (w4 << 34) & 0x1fffffffff, parm);\ - DST(op,i*64+ 7, (w4 >> 3) & 0x1fffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*37+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w4 >> 40) | (w5 << 24) & 0x1fffffffff, parm);\ - DST(op,i*64+ 9, (w5 >> 13) & 0x1fffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*37+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w5 >> 50) | (w6 << 14) & 0x1fffffffff, parm);\ - DST(op,i*64+11, (w6 >> 23) & 0x1fffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*37+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w6 >> 60) | (w7 << 4) & 0x1fffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*37+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w7 >> 33) | (w8 << 31) & 0x1fffffffff, parm);\ - DST(op,i*64+14, (w8 >> 6) & 0x1fffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*37+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, 
(w8 >> 43) | (w9 << 21) & 0x1fffffffff, parm);\ - DST(op,i*64+16, (w9 >> 16) & 0x1fffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*37+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w9 >> 53) | (w10 << 11) & 0x1fffffffff, parm);\ - DST(op,i*64+18, (w10 >> 26) & 0x1fffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*37+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w10 >> 63) | (w11 << 1) & 0x1fffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*37+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w11 >> 36) | (w12 << 28) & 0x1fffffffff, parm);\ - DST(op,i*64+21, (w12 >> 9) & 0x1fffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*37+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w12 >> 46) | (w13 << 18) & 0x1fffffffff, parm);\ - DST(op,i*64+23, (w13 >> 19) & 0x1fffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*37+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w13 >> 56) | (w14 << 8) & 0x1fffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*37+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w14 >> 29) | (w15 << 35) & 0x1fffffffff, parm);\ - DST(op,i*64+26, (w15 >> 2) & 0x1fffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*37+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w15 >> 39) | (w16 << 25) & 0x1fffffffff, parm);\ - DST(op,i*64+28, (w16 >> 12) & 0x1fffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*37+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w16 >> 49) | (w17 << 15) & 0x1fffffffff, parm);\ - DST(op,i*64+30, (w17 >> 22) & 0x1fffffffff, parm); register uint64_t w18 = *(uint32_t *)(ip+(i*37+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w17 >> 59) | (w18 << 5) & 0x1fffffffff, parm);;\ -} - -#define BITUNPACK64_37(ip, op, parm) { \ - BITUNBLK64_37(ip, 0, op, parm); DSTI(op); ip += 37*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_38(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*19+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffffffff, parm); register uint64_t w1 = *(uint64_t 
*)(ip+(i*19+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 38) | (w1 << 26) & 0x3fffffffff, parm);\ - DST(op,i*32+ 2, (w1 >> 12) & 0x3fffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*19+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w1 >> 50) | (w2 << 14) & 0x3fffffffff, parm);\ - DST(op,i*32+ 4, (w2 >> 24) & 0x3fffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*19+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w2 >> 62) | (w3 << 2) & 0x3fffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*19+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w3 >> 36) | (w4 << 28) & 0x3fffffffff, parm);\ - DST(op,i*32+ 7, (w4 >> 10) & 0x3fffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*19+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w4 >> 48) | (w5 << 16) & 0x3fffffffff, parm);\ - DST(op,i*32+ 9, (w5 >> 22) & 0x3fffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*19+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w5 >> 60) | (w6 << 4) & 0x3fffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*19+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w6 >> 34) | (w7 << 30) & 0x3fffffffff, parm);\ - DST(op,i*32+12, (w7 >> 8) & 0x3fffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*19+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w7 >> 46) | (w8 << 18) & 0x3fffffffff, parm);\ - DST(op,i*32+14, (w8 >> 20) & 0x3fffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*19+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w8 >> 58) | (w9 << 6) & 0x3fffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*19+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w9 >> 32) | (w10 << 32) & 0x3fffffffff, parm);\ - DST(op,i*32+17, (w10 >> 6) & 0x3fffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*19+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w10 >> 44) | (w11 << 20) & 0x3fffffffff, parm);\ - DST(op,i*32+19, (w11 >> 18) & 0x3fffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*19+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w11 >> 56) | (w12 
<< 8) & 0x3fffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*19+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w12 >> 30) | (w13 << 34) & 0x3fffffffff, parm);\ - DST(op,i*32+22, (w13 >> 4) & 0x3fffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*19+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w13 >> 42) | (w14 << 22) & 0x3fffffffff, parm);\ - DST(op,i*32+24, (w14 >> 16) & 0x3fffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*19+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w14 >> 54) | (w15 << 10) & 0x3fffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*19+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w15 >> 28) | (w16 << 36) & 0x3fffffffff, parm);\ - DST(op,i*32+27, (w16 >> 2) & 0x3fffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*19+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w16 >> 40) | (w17 << 24) & 0x3fffffffff, parm);\ - DST(op,i*32+29, (w17 >> 14) & 0x3fffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*19+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w17 >> 52) | (w18 << 12) & 0x3fffffffff, parm);\ - DST(op,i*32+31, (w18 >> 26) , parm);;\ -} - -#define BITUNPACK64_38(ip, op, parm) { \ - BITUNBLK64_38(ip, 0, op, parm); DSTI(op); ip += 38*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_39(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*39+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*39+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 39) | (w1 << 25) & 0x7fffffffff, parm);\ - DST(op,i*64+ 2, (w1 >> 14) & 0x7fffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*39+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w1 >> 53) | (w2 << 11) & 0x7fffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*39+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w2 >> 28) | (w3 << 36) & 0x7fffffffff, parm);\ - DST(op,i*64+ 5, (w3 >> 3) & 0x7fffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*39+4)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*64+ 6, (w3 >> 42) | (w4 << 22) & 0x7fffffffff, parm);\ - DST(op,i*64+ 7, (w4 >> 17) & 0x7fffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*39+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w4 >> 56) | (w5 << 8) & 0x7fffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*39+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w5 >> 31) | (w6 << 33) & 0x7fffffffff, parm);\ - DST(op,i*64+10, (w6 >> 6) & 0x7fffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*39+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w6 >> 45) | (w7 << 19) & 0x7fffffffff, parm);\ - DST(op,i*64+12, (w7 >> 20) & 0x7fffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*39+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w7 >> 59) | (w8 << 5) & 0x7fffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*39+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w8 >> 34) | (w9 << 30) & 0x7fffffffff, parm);\ - DST(op,i*64+15, (w9 >> 9) & 0x7fffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*39+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w9 >> 48) | (w10 << 16) & 0x7fffffffff, parm);\ - DST(op,i*64+17, (w10 >> 23) & 0x7fffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*39+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w10 >> 62) | (w11 << 2) & 0x7fffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*39+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w11 >> 37) | (w12 << 27) & 0x7fffffffff, parm);\ - DST(op,i*64+20, (w12 >> 12) & 0x7fffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*39+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w12 >> 51) | (w13 << 13) & 0x7fffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*39+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w13 >> 26) | (w14 << 38) & 0x7fffffffff, parm);\ - DST(op,i*64+23, (w14 >> 1) & 0x7fffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*39+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w14 >> 40) | (w15 << 24) & 0x7fffffffff, parm);\ - DST(op,i*64+25, (w15 >> 15) & 0x7fffffffff, 
parm); register uint64_t w16 = *(uint64_t *)(ip+(i*39+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w15 >> 54) | (w16 << 10) & 0x7fffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*39+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w16 >> 29) | (w17 << 35) & 0x7fffffffff, parm);\ - DST(op,i*64+28, (w17 >> 4) & 0x7fffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*39+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w17 >> 43) | (w18 << 21) & 0x7fffffffff, parm);\ - DST(op,i*64+30, (w18 >> 18) & 0x7fffffffff, parm); register uint64_t w19 = *(uint32_t *)(ip+(i*39+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w18 >> 57) | (w19 << 7) & 0x7fffffffff, parm);;\ -} - -#define BITUNPACK64_39(ip, op, parm) { \ - BITUNBLK64_39(ip, 0, op, parm); DSTI(op); ip += 39*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_40(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*5+0)*8/sizeof(ip[0]));\ - DST(op,i*8+ 0, (w0 ) & 0xffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*5+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 1, (w0 >> 40) | (w1 << 24) & 0xffffffffff, parm);\ - DST(op,i*8+ 2, (w1 >> 16) & 0xffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*5+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 3, (w1 >> 56) | (w2 << 8) & 0xffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*5+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 4, (w2 >> 32) | (w3 << 32) & 0xffffffffff, parm);\ - DST(op,i*8+ 5, (w3 >> 8) & 0xffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*5+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 6, (w3 >> 48) | (w4 << 16) & 0xffffffffff, parm);\ - DST(op,i*8+ 7, (w4 >> 24) , parm);;\ -} - -#define BITUNPACK64_40(ip, op, parm) { \ - BITUNBLK64_40(ip, 0, op, parm);\ - BITUNBLK64_40(ip, 1, op, parm);\ - BITUNBLK64_40(ip, 2, op, parm);\ - BITUNBLK64_40(ip, 3, op, parm); DSTI(op); ip += 40*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_41(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*41+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 
0x1ffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*41+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 41) | (w1 << 23) & 0x1ffffffffff, parm);\ - DST(op,i*64+ 2, (w1 >> 18) & 0x1ffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*41+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w1 >> 59) | (w2 << 5) & 0x1ffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*41+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w2 >> 36) | (w3 << 28) & 0x1ffffffffff, parm);\ - DST(op,i*64+ 5, (w3 >> 13) & 0x1ffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*41+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w3 >> 54) | (w4 << 10) & 0x1ffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*41+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w4 >> 31) | (w5 << 33) & 0x1ffffffffff, parm);\ - DST(op,i*64+ 8, (w5 >> 8) & 0x1ffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*41+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w5 >> 49) | (w6 << 15) & 0x1ffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*41+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w6 >> 26) | (w7 << 38) & 0x1ffffffffff, parm);\ - DST(op,i*64+11, (w7 >> 3) & 0x1ffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*41+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w7 >> 44) | (w8 << 20) & 0x1ffffffffff, parm);\ - DST(op,i*64+13, (w8 >> 21) & 0x1ffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*41+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w8 >> 62) | (w9 << 2) & 0x1ffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*41+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w9 >> 39) | (w10 << 25) & 0x1ffffffffff, parm);\ - DST(op,i*64+16, (w10 >> 16) & 0x1ffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*41+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w10 >> 57) | (w11 << 7) & 0x1ffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*41+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w11 >> 34) | (w12 << 30) & 0x1ffffffffff, parm);\ - 
DST(op,i*64+19, (w12 >> 11) & 0x1ffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*41+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w12 >> 52) | (w13 << 12) & 0x1ffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*41+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w13 >> 29) | (w14 << 35) & 0x1ffffffffff, parm);\ - DST(op,i*64+22, (w14 >> 6) & 0x1ffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*41+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w14 >> 47) | (w15 << 17) & 0x1ffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*41+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w15 >> 24) | (w16 << 40) & 0x1ffffffffff, parm);\ - DST(op,i*64+25, (w16 >> 1) & 0x1ffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*41+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w16 >> 42) | (w17 << 22) & 0x1ffffffffff, parm);\ - DST(op,i*64+27, (w17 >> 19) & 0x1ffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*41+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w17 >> 60) | (w18 << 4) & 0x1ffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*41+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w18 >> 37) | (w19 << 27) & 0x1ffffffffff, parm);\ - DST(op,i*64+30, (w19 >> 14) & 0x1ffffffffff, parm); register uint64_t w20 = *(uint32_t *)(ip+(i*41+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w19 >> 55) | (w20 << 9) & 0x1ffffffffff, parm);;\ -} - -#define BITUNPACK64_41(ip, op, parm) { \ - BITUNBLK64_41(ip, 0, op, parm); DSTI(op); ip += 41*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_42(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*21+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*21+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 42) | (w1 << 22) & 0x3ffffffffff, parm);\ - DST(op,i*32+ 2, (w1 >> 20) & 0x3ffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*21+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w1 >> 62) | (w2 << 2) & 0x3ffffffffff, 
parm); register uint64_t w3 = *(uint64_t *)(ip+(i*21+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w2 >> 40) | (w3 << 24) & 0x3ffffffffff, parm);\ - DST(op,i*32+ 5, (w3 >> 18) & 0x3ffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*21+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w3 >> 60) | (w4 << 4) & 0x3ffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*21+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w4 >> 38) | (w5 << 26) & 0x3ffffffffff, parm);\ - DST(op,i*32+ 8, (w5 >> 16) & 0x3ffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*21+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w5 >> 58) | (w6 << 6) & 0x3ffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*21+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w6 >> 36) | (w7 << 28) & 0x3ffffffffff, parm);\ - DST(op,i*32+11, (w7 >> 14) & 0x3ffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*21+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w7 >> 56) | (w8 << 8) & 0x3ffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*21+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w8 >> 34) | (w9 << 30) & 0x3ffffffffff, parm);\ - DST(op,i*32+14, (w9 >> 12) & 0x3ffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*21+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w9 >> 54) | (w10 << 10) & 0x3ffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*21+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w10 >> 32) | (w11 << 32) & 0x3ffffffffff, parm);\ - DST(op,i*32+17, (w11 >> 10) & 0x3ffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*21+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w11 >> 52) | (w12 << 12) & 0x3ffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*21+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w12 >> 30) | (w13 << 34) & 0x3ffffffffff, parm);\ - DST(op,i*32+20, (w13 >> 8) & 0x3ffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*21+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w13 >> 50) | (w14 << 14) & 0x3ffffffffff, parm); 
register uint64_t w15 = *(uint64_t *)(ip+(i*21+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w14 >> 28) | (w15 << 36) & 0x3ffffffffff, parm);\ - DST(op,i*32+23, (w15 >> 6) & 0x3ffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*21+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w15 >> 48) | (w16 << 16) & 0x3ffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*21+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w16 >> 26) | (w17 << 38) & 0x3ffffffffff, parm);\ - DST(op,i*32+26, (w17 >> 4) & 0x3ffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*21+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w17 >> 46) | (w18 << 18) & 0x3ffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*21+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w18 >> 24) | (w19 << 40) & 0x3ffffffffff, parm);\ - DST(op,i*32+29, (w19 >> 2) & 0x3ffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*21+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w19 >> 44) | (w20 << 20) & 0x3ffffffffff, parm);\ - DST(op,i*32+31, (w20 >> 22) , parm);;\ -} - -#define BITUNPACK64_42(ip, op, parm) { \ - BITUNBLK64_42(ip, 0, op, parm); DSTI(op); ip += 42*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_43(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*43+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*43+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 43) | (w1 << 21) & 0x7ffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*43+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 22) | (w2 << 42) & 0x7ffffffffff, parm);\ - DST(op,i*64+ 3, (w2 >> 1) & 0x7ffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*43+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w2 >> 44) | (w3 << 20) & 0x7ffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*43+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w3 >> 23) | (w4 << 41) & 0x7ffffffffff, parm);\ - DST(op,i*64+ 6, (w4 >> 2) & 0x7ffffffffff, parm); register 
uint64_t w5 = *(uint64_t *)(ip+(i*43+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w4 >> 45) | (w5 << 19) & 0x7ffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*43+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w5 >> 24) | (w6 << 40) & 0x7ffffffffff, parm);\ - DST(op,i*64+ 9, (w6 >> 3) & 0x7ffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*43+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w6 >> 46) | (w7 << 18) & 0x7ffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*43+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w7 >> 25) | (w8 << 39) & 0x7ffffffffff, parm);\ - DST(op,i*64+12, (w8 >> 4) & 0x7ffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*43+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w8 >> 47) | (w9 << 17) & 0x7ffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*43+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w9 >> 26) | (w10 << 38) & 0x7ffffffffff, parm);\ - DST(op,i*64+15, (w10 >> 5) & 0x7ffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*43+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w10 >> 48) | (w11 << 16) & 0x7ffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*43+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w11 >> 27) | (w12 << 37) & 0x7ffffffffff, parm);\ - DST(op,i*64+18, (w12 >> 6) & 0x7ffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*43+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w12 >> 49) | (w13 << 15) & 0x7ffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*43+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w13 >> 28) | (w14 << 36) & 0x7ffffffffff, parm);\ - DST(op,i*64+21, (w14 >> 7) & 0x7ffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*43+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w14 >> 50) | (w15 << 14) & 0x7ffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*43+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w15 >> 29) | (w16 << 35) & 0x7ffffffffff, parm);\ - DST(op,i*64+24, (w16 >> 8) & 0x7ffffffffff, parm); register 
uint64_t w17 = *(uint64_t *)(ip+(i*43+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w16 >> 51) | (w17 << 13) & 0x7ffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*43+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w17 >> 30) | (w18 << 34) & 0x7ffffffffff, parm);\ - DST(op,i*64+27, (w18 >> 9) & 0x7ffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*43+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w18 >> 52) | (w19 << 12) & 0x7ffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*43+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w19 >> 31) | (w20 << 33) & 0x7ffffffffff, parm);\ - DST(op,i*64+30, (w20 >> 10) & 0x7ffffffffff, parm); register uint64_t w21 = *(uint32_t *)(ip+(i*43+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w20 >> 53) | (w21 << 11) & 0x7ffffffffff, parm);;\ -} - -#define BITUNPACK64_43(ip, op, parm) { \ - BITUNBLK64_43(ip, 0, op, parm); DSTI(op); ip += 43*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_44(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*11+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*11+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 1, (w0 >> 44) | (w1 << 20) & 0xfffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*11+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 2, (w1 >> 24) | (w2 << 40) & 0xfffffffffff, parm);\ - DST(op,i*16+ 3, (w2 >> 4) & 0xfffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*11+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 4, (w2 >> 48) | (w3 << 16) & 0xfffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*11+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 5, (w3 >> 28) | (w4 << 36) & 0xfffffffffff, parm);\ - DST(op,i*16+ 6, (w4 >> 8) & 0xfffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*11+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 7, (w4 >> 52) | (w5 << 12) & 0xfffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*11+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 8, (w5 >> 32) | (w6 << 32) & 
0xfffffffffff, parm);\ - DST(op,i*16+ 9, (w6 >> 12) & 0xfffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*11+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+10, (w6 >> 56) | (w7 << 8) & 0xfffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*11+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+11, (w7 >> 36) | (w8 << 28) & 0xfffffffffff, parm);\ - DST(op,i*16+12, (w8 >> 16) & 0xfffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*11+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+13, (w8 >> 60) | (w9 << 4) & 0xfffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*11+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+14, (w9 >> 40) | (w10 << 24) & 0xfffffffffff, parm);\ - DST(op,i*16+15, (w10 >> 20) , parm);;\ -} - -#define BITUNPACK64_44(ip, op, parm) { \ - BITUNBLK64_44(ip, 0, op, parm);\ - BITUNBLK64_44(ip, 1, op, parm); DSTI(op); ip += 44*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_45(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*45+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*45+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 45) | (w1 << 19) & 0x1fffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*45+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 26) | (w2 << 38) & 0x1fffffffffff, parm);\ - DST(op,i*64+ 3, (w2 >> 7) & 0x1fffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*45+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w2 >> 52) | (w3 << 12) & 0x1fffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*45+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w3 >> 33) | (w4 << 31) & 0x1fffffffffff, parm);\ - DST(op,i*64+ 6, (w4 >> 14) & 0x1fffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*45+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w4 >> 59) | (w5 << 5) & 0x1fffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*45+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w5 >> 40) | (w6 << 24) & 0x1fffffffffff, parm); register uint64_t w7 = 
*(uint64_t *)(ip+(i*45+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w6 >> 21) | (w7 << 43) & 0x1fffffffffff, parm);\ - DST(op,i*64+10, (w7 >> 2) & 0x1fffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*45+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w7 >> 47) | (w8 << 17) & 0x1fffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*45+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w8 >> 28) | (w9 << 36) & 0x1fffffffffff, parm);\ - DST(op,i*64+13, (w9 >> 9) & 0x1fffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*45+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w9 >> 54) | (w10 << 10) & 0x1fffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*45+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w10 >> 35) | (w11 << 29) & 0x1fffffffffff, parm);\ - DST(op,i*64+16, (w11 >> 16) & 0x1fffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*45+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w11 >> 61) | (w12 << 3) & 0x1fffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*45+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w12 >> 42) | (w13 << 22) & 0x1fffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*45+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w13 >> 23) | (w14 << 41) & 0x1fffffffffff, parm);\ - DST(op,i*64+20, (w14 >> 4) & 0x1fffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*45+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w14 >> 49) | (w15 << 15) & 0x1fffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*45+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w15 >> 30) | (w16 << 34) & 0x1fffffffffff, parm);\ - DST(op,i*64+23, (w16 >> 11) & 0x1fffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*45+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w16 >> 56) | (w17 << 8) & 0x1fffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*45+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w17 >> 37) | (w18 << 27) & 0x1fffffffffff, parm);\ - DST(op,i*64+26, (w18 >> 18) & 0x1fffffffffff, parm); 
register uint64_t w19 = *(uint64_t *)(ip+(i*45+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w18 >> 63) | (w19 << 1) & 0x1fffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*45+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w19 >> 44) | (w20 << 20) & 0x1fffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*45+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w20 >> 25) | (w21 << 39) & 0x1fffffffffff, parm);\ - DST(op,i*64+30, (w21 >> 6) & 0x1fffffffffff, parm); register uint64_t w22 = *(uint32_t *)(ip+(i*45+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w21 >> 51) | (w22 << 13) & 0x1fffffffffff, parm);;\ -} - -#define BITUNPACK64_45(ip, op, parm) { \ - BITUNBLK64_45(ip, 0, op, parm); DSTI(op); ip += 45*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_46(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*23+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*23+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 46) | (w1 << 18) & 0x3fffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*23+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w1 >> 28) | (w2 << 36) & 0x3fffffffffff, parm);\ - DST(op,i*32+ 3, (w2 >> 10) & 0x3fffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*23+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w2 >> 56) | (w3 << 8) & 0x3fffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*23+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w3 >> 38) | (w4 << 26) & 0x3fffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*23+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w4 >> 20) | (w5 << 44) & 0x3fffffffffff, parm);\ - DST(op,i*32+ 7, (w5 >> 2) & 0x3fffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*23+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w5 >> 48) | (w6 << 16) & 0x3fffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*23+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w6 >> 30) | (w7 << 34) & 0x3fffffffffff, parm);\ - DST(op,i*32+10, 
(w7 >> 12) & 0x3fffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*23+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w7 >> 58) | (w8 << 6) & 0x3fffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*23+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w8 >> 40) | (w9 << 24) & 0x3fffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*23+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w9 >> 22) | (w10 << 42) & 0x3fffffffffff, parm);\ - DST(op,i*32+14, (w10 >> 4) & 0x3fffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*23+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w10 >> 50) | (w11 << 14) & 0x3fffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*23+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w11 >> 32) | (w12 << 32) & 0x3fffffffffff, parm);\ - DST(op,i*32+17, (w12 >> 14) & 0x3fffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*23+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w12 >> 60) | (w13 << 4) & 0x3fffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*23+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w13 >> 42) | (w14 << 22) & 0x3fffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*23+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w14 >> 24) | (w15 << 40) & 0x3fffffffffff, parm);\ - DST(op,i*32+21, (w15 >> 6) & 0x3fffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*23+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w15 >> 52) | (w16 << 12) & 0x3fffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*23+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w16 >> 34) | (w17 << 30) & 0x3fffffffffff, parm);\ - DST(op,i*32+24, (w17 >> 16) & 0x3fffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*23+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w17 >> 62) | (w18 << 2) & 0x3fffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*23+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w18 >> 44) | (w19 << 20) & 0x3fffffffffff, parm); register uint64_t w20 = *(uint64_t 
*)(ip+(i*23+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w19 >> 26) | (w20 << 38) & 0x3fffffffffff, parm);\ - DST(op,i*32+28, (w20 >> 8) & 0x3fffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*23+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w20 >> 54) | (w21 << 10) & 0x3fffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*23+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w21 >> 36) | (w22 << 28) & 0x3fffffffffff, parm);\ - DST(op,i*32+31, (w22 >> 18) , parm);;\ -} - -#define BITUNPACK64_46(ip, op, parm) { \ - BITUNBLK64_46(ip, 0, op, parm); DSTI(op); ip += 46*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_47(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*47+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*47+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 47) | (w1 << 17) & 0x7fffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*47+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 30) | (w2 << 34) & 0x7fffffffffff, parm);\ - DST(op,i*64+ 3, (w2 >> 13) & 0x7fffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*47+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w2 >> 60) | (w3 << 4) & 0x7fffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*47+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w3 >> 43) | (w4 << 21) & 0x7fffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*47+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w4 >> 26) | (w5 << 38) & 0x7fffffffffff, parm);\ - DST(op,i*64+ 7, (w5 >> 9) & 0x7fffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*47+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w5 >> 56) | (w6 << 8) & 0x7fffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*47+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w6 >> 39) | (w7 << 25) & 0x7fffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*47+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w7 >> 22) | (w8 << 42) & 0x7fffffffffff, parm);\ - DST(op,i*64+11, 
(w8 >> 5) & 0x7fffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*47+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w8 >> 52) | (w9 << 12) & 0x7fffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*47+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w9 >> 35) | (w10 << 29) & 0x7fffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*47+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w10 >> 18) | (w11 << 46) & 0x7fffffffffff, parm);\ - DST(op,i*64+15, (w11 >> 1) & 0x7fffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*47+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w11 >> 48) | (w12 << 16) & 0x7fffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*47+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w12 >> 31) | (w13 << 33) & 0x7fffffffffff, parm);\ - DST(op,i*64+18, (w13 >> 14) & 0x7fffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*47+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w13 >> 61) | (w14 << 3) & 0x7fffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*47+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w14 >> 44) | (w15 << 20) & 0x7fffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*47+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w15 >> 27) | (w16 << 37) & 0x7fffffffffff, parm);\ - DST(op,i*64+22, (w16 >> 10) & 0x7fffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*47+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w16 >> 57) | (w17 << 7) & 0x7fffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*47+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w17 >> 40) | (w18 << 24) & 0x7fffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*47+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w18 >> 23) | (w19 << 41) & 0x7fffffffffff, parm);\ - DST(op,i*64+26, (w19 >> 6) & 0x7fffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*47+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w19 >> 53) | (w20 << 11) & 0x7fffffffffff, parm); register uint64_t w21 = *(uint64_t 
*)(ip+(i*47+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w20 >> 36) | (w21 << 28) & 0x7fffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*47+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w21 >> 19) | (w22 << 45) & 0x7fffffffffff, parm);\ - DST(op,i*64+30, (w22 >> 2) & 0x7fffffffffff, parm); register uint64_t w23 = *(uint32_t *)(ip+(i*47+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w22 >> 49) | (w23 << 15) & 0x7fffffffffff, parm);;\ -} - -#define BITUNPACK64_47(ip, op, parm) { \ - BITUNBLK64_47(ip, 0, op, parm); DSTI(op); ip += 47*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_48(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*3+0)*8/sizeof(ip[0]));\ - DST(op,i*4+ 0, (w0 ) & 0xffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*3+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*4+ 1, (w0 >> 48) | (w1 << 16) & 0xffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*3+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*4+ 2, (w1 >> 32) | (w2 << 32) & 0xffffffffffff, parm);\ - DST(op,i*4+ 3, (w2 >> 16) , parm);;\ -} - -#define BITUNPACK64_48(ip, op, parm) { \ - BITUNBLK64_48(ip, 0, op, parm);\ - BITUNBLK64_48(ip, 1, op, parm);\ - BITUNBLK64_48(ip, 2, op, parm);\ - BITUNBLK64_48(ip, 3, op, parm);\ - BITUNBLK64_48(ip, 4, op, parm);\ - BITUNBLK64_48(ip, 5, op, parm);\ - BITUNBLK64_48(ip, 6, op, parm);\ - BITUNBLK64_48(ip, 7, op, parm); DSTI(op); ip += 48*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_49(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*49+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*49+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 49) | (w1 << 15) & 0x1ffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*49+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 34) | (w2 << 30) & 0x1ffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*49+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 19) | (w3 << 45) & 0x1ffffffffffff, parm);\ - 
DST(op,i*64+ 4, (w3 >> 4) & 0x1ffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*49+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w3 >> 53) | (w4 << 11) & 0x1ffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*49+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w4 >> 38) | (w5 << 26) & 0x1ffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*49+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w5 >> 23) | (w6 << 41) & 0x1ffffffffffff, parm);\ - DST(op,i*64+ 8, (w6 >> 8) & 0x1ffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*49+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w6 >> 57) | (w7 << 7) & 0x1ffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*49+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w7 >> 42) | (w8 << 22) & 0x1ffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*49+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w8 >> 27) | (w9 << 37) & 0x1ffffffffffff, parm);\ - DST(op,i*64+12, (w9 >> 12) & 0x1ffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*49+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w9 >> 61) | (w10 << 3) & 0x1ffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*49+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w10 >> 46) | (w11 << 18) & 0x1ffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*49+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w11 >> 31) | (w12 << 33) & 0x1ffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*49+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w12 >> 16) | (w13 << 48) & 0x1ffffffffffff, parm);\ - DST(op,i*64+17, (w13 >> 1) & 0x1ffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*49+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w13 >> 50) | (w14 << 14) & 0x1ffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*49+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w14 >> 35) | (w15 << 29) & 0x1ffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*49+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, 
(w15 >> 20) | (w16 << 44) & 0x1ffffffffffff, parm);\ - DST(op,i*64+21, (w16 >> 5) & 0x1ffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*49+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w16 >> 54) | (w17 << 10) & 0x1ffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*49+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w17 >> 39) | (w18 << 25) & 0x1ffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*49+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w18 >> 24) | (w19 << 40) & 0x1ffffffffffff, parm);\ - DST(op,i*64+25, (w19 >> 9) & 0x1ffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*49+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w19 >> 58) | (w20 << 6) & 0x1ffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*49+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w20 >> 43) | (w21 << 21) & 0x1ffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*49+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w21 >> 28) | (w22 << 36) & 0x1ffffffffffff, parm);\ - DST(op,i*64+29, (w22 >> 13) & 0x1ffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*49+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w22 >> 62) | (w23 << 2) & 0x1ffffffffffff, parm); register uint64_t w24 = *(uint32_t *)(ip+(i*49+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w23 >> 47) | (w24 << 17) & 0x1ffffffffffff, parm);;\ -} - -#define BITUNPACK64_49(ip, op, parm) { \ - BITUNBLK64_49(ip, 0, op, parm); DSTI(op); ip += 49*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_50(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*25+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*25+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 50) | (w1 << 14) & 0x3ffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*25+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w1 >> 36) | (w2 << 28) & 0x3ffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*25+3)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*32+ 3, (w2 >> 22) | (w3 << 42) & 0x3ffffffffffff, parm);\ - DST(op,i*32+ 4, (w3 >> 8) & 0x3ffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*25+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w3 >> 58) | (w4 << 6) & 0x3ffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*25+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w4 >> 44) | (w5 << 20) & 0x3ffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*25+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w5 >> 30) | (w6 << 34) & 0x3ffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*25+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w6 >> 16) | (w7 << 48) & 0x3ffffffffffff, parm);\ - DST(op,i*32+ 9, (w7 >> 2) & 0x3ffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*25+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w7 >> 52) | (w8 << 12) & 0x3ffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*25+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w8 >> 38) | (w9 << 26) & 0x3ffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*25+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w9 >> 24) | (w10 << 40) & 0x3ffffffffffff, parm);\ - DST(op,i*32+13, (w10 >> 10) & 0x3ffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*25+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w10 >> 60) | (w11 << 4) & 0x3ffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*25+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w11 >> 46) | (w12 << 18) & 0x3ffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*25+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w12 >> 32) | (w13 << 32) & 0x3ffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*25+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w13 >> 18) | (w14 << 46) & 0x3ffffffffffff, parm);\ - DST(op,i*32+18, (w14 >> 4) & 0x3ffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*25+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w14 >> 54) | (w15 << 10) & 0x3ffffffffffff, parm); register uint64_t w16 
= *(uint64_t *)(ip+(i*25+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w15 >> 40) | (w16 << 24) & 0x3ffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*25+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w16 >> 26) | (w17 << 38) & 0x3ffffffffffff, parm);\ - DST(op,i*32+22, (w17 >> 12) & 0x3ffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*25+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w17 >> 62) | (w18 << 2) & 0x3ffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*25+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w18 >> 48) | (w19 << 16) & 0x3ffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*25+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w19 >> 34) | (w20 << 30) & 0x3ffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*25+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w20 >> 20) | (w21 << 44) & 0x3ffffffffffff, parm);\ - DST(op,i*32+27, (w21 >> 6) & 0x3ffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*25+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w21 >> 56) | (w22 << 8) & 0x3ffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*25+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w22 >> 42) | (w23 << 22) & 0x3ffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*25+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w23 >> 28) | (w24 << 36) & 0x3ffffffffffff, parm);\ - DST(op,i*32+31, (w24 >> 14) , parm);;\ -} - -#define BITUNPACK64_50(ip, op, parm) { \ - BITUNBLK64_50(ip, 0, op, parm); DSTI(op); ip += 50*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_51(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*51+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*51+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 51) | (w1 << 13) & 0x7ffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*51+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 38) | (w2 << 26) & 0x7ffffffffffff, parm); register uint64_t w3 
= *(uint64_t *)(ip+(i*51+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 25) | (w3 << 39) & 0x7ffffffffffff, parm);\ - DST(op,i*64+ 4, (w3 >> 12) & 0x7ffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*51+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w3 >> 63) | (w4 << 1) & 0x7ffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*51+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w4 >> 50) | (w5 << 14) & 0x7ffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*51+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w5 >> 37) | (w6 << 27) & 0x7ffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*51+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w6 >> 24) | (w7 << 40) & 0x7ffffffffffff, parm);\ - DST(op,i*64+ 9, (w7 >> 11) & 0x7ffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*51+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w7 >> 62) | (w8 << 2) & 0x7ffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*51+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w8 >> 49) | (w9 << 15) & 0x7ffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*51+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w9 >> 36) | (w10 << 28) & 0x7ffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*51+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w10 >> 23) | (w11 << 41) & 0x7ffffffffffff, parm);\ - DST(op,i*64+14, (w11 >> 10) & 0x7ffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*51+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w11 >> 61) | (w12 << 3) & 0x7ffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*51+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w12 >> 48) | (w13 << 16) & 0x7ffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*51+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w13 >> 35) | (w14 << 29) & 0x7ffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*51+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w14 >> 22) | (w15 << 42) & 0x7ffffffffffff, parm);\ - DST(op,i*64+19, (w15 
>> 9) & 0x7ffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*51+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w15 >> 60) | (w16 << 4) & 0x7ffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*51+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w16 >> 47) | (w17 << 17) & 0x7ffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*51+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w17 >> 34) | (w18 << 30) & 0x7ffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*51+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w18 >> 21) | (w19 << 43) & 0x7ffffffffffff, parm);\ - DST(op,i*64+24, (w19 >> 8) & 0x7ffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*51+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w19 >> 59) | (w20 << 5) & 0x7ffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*51+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w20 >> 46) | (w21 << 18) & 0x7ffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*51+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w21 >> 33) | (w22 << 31) & 0x7ffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*51+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w22 >> 20) | (w23 << 44) & 0x7ffffffffffff, parm);\ - DST(op,i*64+29, (w23 >> 7) & 0x7ffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*51+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w23 >> 58) | (w24 << 6) & 0x7ffffffffffff, parm); register uint64_t w25 = *(uint32_t *)(ip+(i*51+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w24 >> 45) | (w25 << 19) & 0x7ffffffffffff, parm);;\ -} - -#define BITUNPACK64_51(ip, op, parm) { \ - BITUNBLK64_51(ip, 0, op, parm); DSTI(op); ip += 51*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_52(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*13+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*13+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 1, (w0 >> 52) | (w1 << 12) & 0xfffffffffffff, parm); 
register uint64_t w2 = *(uint64_t *)(ip+(i*13+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 2, (w1 >> 40) | (w2 << 24) & 0xfffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*13+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 3, (w2 >> 28) | (w3 << 36) & 0xfffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*13+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 4, (w3 >> 16) | (w4 << 48) & 0xfffffffffffff, parm);\ - DST(op,i*16+ 5, (w4 >> 4) & 0xfffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*13+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 6, (w4 >> 56) | (w5 << 8) & 0xfffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*13+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 7, (w5 >> 44) | (w6 << 20) & 0xfffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*13+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 8, (w6 >> 32) | (w7 << 32) & 0xfffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*13+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 9, (w7 >> 20) | (w8 << 44) & 0xfffffffffffff, parm);\ - DST(op,i*16+10, (w8 >> 8) & 0xfffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*13+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+11, (w8 >> 60) | (w9 << 4) & 0xfffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*13+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+12, (w9 >> 48) | (w10 << 16) & 0xfffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*13+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+13, (w10 >> 36) | (w11 << 28) & 0xfffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*13+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+14, (w11 >> 24) | (w12 << 40) & 0xfffffffffffff, parm);\ - DST(op,i*16+15, (w12 >> 12) , parm);;\ -} - -#define BITUNPACK64_52(ip, op, parm) { \ - BITUNBLK64_52(ip, 0, op, parm);\ - BITUNBLK64_52(ip, 1, op, parm); DSTI(op); ip += 52*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_53(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*53+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 
0x1fffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*53+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 53) | (w1 << 11) & 0x1fffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*53+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 42) | (w2 << 22) & 0x1fffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*53+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 31) | (w3 << 33) & 0x1fffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*53+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w3 >> 20) | (w4 << 44) & 0x1fffffffffffff, parm);\ - DST(op,i*64+ 5, (w4 >> 9) & 0x1fffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*53+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w4 >> 62) | (w5 << 2) & 0x1fffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*53+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w5 >> 51) | (w6 << 13) & 0x1fffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*53+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w6 >> 40) | (w7 << 24) & 0x1fffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*53+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w7 >> 29) | (w8 << 35) & 0x1fffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*53+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w8 >> 18) | (w9 << 46) & 0x1fffffffffffff, parm);\ - DST(op,i*64+11, (w9 >> 7) & 0x1fffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*53+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w9 >> 60) | (w10 << 4) & 0x1fffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*53+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w10 >> 49) | (w11 << 15) & 0x1fffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*53+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w11 >> 38) | (w12 << 26) & 0x1fffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*53+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w12 >> 27) | (w13 << 37) & 0x1fffffffffffff, parm); register uint64_t w14 = 
*(uint64_t *)(ip+(i*53+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w13 >> 16) | (w14 << 48) & 0x1fffffffffffff, parm);\ - DST(op,i*64+17, (w14 >> 5) & 0x1fffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*53+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w14 >> 58) | (w15 << 6) & 0x1fffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*53+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w15 >> 47) | (w16 << 17) & 0x1fffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*53+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w16 >> 36) | (w17 << 28) & 0x1fffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*53+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w17 >> 25) | (w18 << 39) & 0x1fffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*53+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w18 >> 14) | (w19 << 50) & 0x1fffffffffffff, parm);\ - DST(op,i*64+23, (w19 >> 3) & 0x1fffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*53+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w19 >> 56) | (w20 << 8) & 0x1fffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*53+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w20 >> 45) | (w21 << 19) & 0x1fffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*53+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w21 >> 34) | (w22 << 30) & 0x1fffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*53+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w22 >> 23) | (w23 << 41) & 0x1fffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*53+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w23 >> 12) | (w24 << 52) & 0x1fffffffffffff, parm);\ - DST(op,i*64+29, (w24 >> 1) & 0x1fffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*53+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w24 >> 54) | (w25 << 10) & 0x1fffffffffffff, parm); register uint64_t w26 = *(uint32_t *)(ip+(i*53+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w25 >> 43) | (w26 << 21) & 
0x1fffffffffffff, parm);;\ -} - -#define BITUNPACK64_53(ip, op, parm) { \ - BITUNBLK64_53(ip, 0, op, parm); DSTI(op); ip += 53*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_54(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*27+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*27+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 54) | (w1 << 10) & 0x3fffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*27+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w1 >> 44) | (w2 << 20) & 0x3fffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*27+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w2 >> 34) | (w3 << 30) & 0x3fffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*27+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w3 >> 24) | (w4 << 40) & 0x3fffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*27+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w4 >> 14) | (w5 << 50) & 0x3fffffffffffff, parm);\ - DST(op,i*32+ 6, (w5 >> 4) & 0x3fffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*27+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w5 >> 58) | (w6 << 6) & 0x3fffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*27+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w6 >> 48) | (w7 << 16) & 0x3fffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*27+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w7 >> 38) | (w8 << 26) & 0x3fffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*27+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w8 >> 28) | (w9 << 36) & 0x3fffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*27+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w9 >> 18) | (w10 << 46) & 0x3fffffffffffff, parm);\ - DST(op,i*32+12, (w10 >> 8) & 0x3fffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*27+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w10 >> 62) | (w11 << 2) & 0x3fffffffffffff, parm); register uint64_t w12 = 
*(uint64_t *)(ip+(i*27+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w11 >> 52) | (w12 << 12) & 0x3fffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*27+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w12 >> 42) | (w13 << 22) & 0x3fffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*27+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w13 >> 32) | (w14 << 32) & 0x3fffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*27+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w14 >> 22) | (w15 << 42) & 0x3fffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*27+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w15 >> 12) | (w16 << 52) & 0x3fffffffffffff, parm);\ - DST(op,i*32+19, (w16 >> 2) & 0x3fffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*27+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w16 >> 56) | (w17 << 8) & 0x3fffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*27+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w17 >> 46) | (w18 << 18) & 0x3fffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*27+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w18 >> 36) | (w19 << 28) & 0x3fffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*27+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w19 >> 26) | (w20 << 38) & 0x3fffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*27+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w20 >> 16) | (w21 << 48) & 0x3fffffffffffff, parm);\ - DST(op,i*32+25, (w21 >> 6) & 0x3fffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*27+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w21 >> 60) | (w22 << 4) & 0x3fffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*27+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w22 >> 50) | (w23 << 14) & 0x3fffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*27+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w23 >> 40) | (w24 << 24) & 0x3fffffffffffff, parm); register uint64_t w25 = *(uint64_t 
*)(ip+(i*27+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w24 >> 30) | (w25 << 34) & 0x3fffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*27+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w25 >> 20) | (w26 << 44) & 0x3fffffffffffff, parm);\ - DST(op,i*32+31, (w26 >> 10) , parm);;\ -} - -#define BITUNPACK64_54(ip, op, parm) { \ - BITUNBLK64_54(ip, 0, op, parm); DSTI(op); ip += 54*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_55(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*55+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*55+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 55) | (w1 << 9) & 0x7fffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*55+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 46) | (w2 << 18) & 0x7fffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*55+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 37) | (w3 << 27) & 0x7fffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*55+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w3 >> 28) | (w4 << 36) & 0x7fffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*55+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w4 >> 19) | (w5 << 45) & 0x7fffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*55+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w5 >> 10) | (w6 << 54) & 0x7fffffffffffff, parm);\ - DST(op,i*64+ 7, (w6 >> 1) & 0x7fffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*55+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w6 >> 56) | (w7 << 8) & 0x7fffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*55+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w7 >> 47) | (w8 << 17) & 0x7fffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*55+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w8 >> 38) | (w9 << 26) & 0x7fffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*55+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w9 >> 29) | 
(w10 << 35) & 0x7fffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*55+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w10 >> 20) | (w11 << 44) & 0x7fffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*55+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w11 >> 11) | (w12 << 53) & 0x7fffffffffffff, parm);\ - DST(op,i*64+14, (w12 >> 2) & 0x7fffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*55+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w12 >> 57) | (w13 << 7) & 0x7fffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*55+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w13 >> 48) | (w14 << 16) & 0x7fffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*55+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w14 >> 39) | (w15 << 25) & 0x7fffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*55+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w15 >> 30) | (w16 << 34) & 0x7fffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*55+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w16 >> 21) | (w17 << 43) & 0x7fffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*55+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w17 >> 12) | (w18 << 52) & 0x7fffffffffffff, parm);\ - DST(op,i*64+21, (w18 >> 3) & 0x7fffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*55+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w18 >> 58) | (w19 << 6) & 0x7fffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*55+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w19 >> 49) | (w20 << 15) & 0x7fffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*55+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w20 >> 40) | (w21 << 24) & 0x7fffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*55+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w21 >> 31) | (w22 << 33) & 0x7fffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*55+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w22 >> 22) | (w23 << 42) & 
0x7fffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*55+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w23 >> 13) | (w24 << 51) & 0x7fffffffffffff, parm);\ - DST(op,i*64+28, (w24 >> 4) & 0x7fffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*55+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w24 >> 59) | (w25 << 5) & 0x7fffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*55+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w25 >> 50) | (w26 << 14) & 0x7fffffffffffff, parm); register uint64_t w27 = *(uint32_t *)(ip+(i*55+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w26 >> 41) | (w27 << 23) & 0x7fffffffffffff, parm);;\ -} - -#define BITUNPACK64_55(ip, op, parm) { \ - BITUNBLK64_55(ip, 0, op, parm); DSTI(op); ip += 55*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_56(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*7+0)*8/sizeof(ip[0]));\ - DST(op,i*8+ 0, (w0 ) & 0xffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*7+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 1, (w0 >> 56) | (w1 << 8) & 0xffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*7+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 2, (w1 >> 48) | (w2 << 16) & 0xffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*7+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 3, (w2 >> 40) | (w3 << 24) & 0xffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*7+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 4, (w3 >> 32) | (w4 << 32) & 0xffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*7+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 5, (w4 >> 24) | (w5 << 40) & 0xffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*7+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*8+ 6, (w5 >> 16) | (w6 << 48) & 0xffffffffffffff, parm);\ - DST(op,i*8+ 7, (w6 >> 8) , parm);;\ -} - -#define BITUNPACK64_56(ip, op, parm) { \ - BITUNBLK64_56(ip, 0, op, parm);\ - BITUNBLK64_56(ip, 1, op, parm);\ - BITUNBLK64_56(ip, 2, op, parm);\ - BITUNBLK64_56(ip, 3, op, parm); 
DSTI(op); ip += 56*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_57(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*57+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1ffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*57+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 57) | (w1 << 7) & 0x1ffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*57+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 50) | (w2 << 14) & 0x1ffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*57+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 43) | (w3 << 21) & 0x1ffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*57+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w3 >> 36) | (w4 << 28) & 0x1ffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*57+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w4 >> 29) | (w5 << 35) & 0x1ffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*57+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w5 >> 22) | (w6 << 42) & 0x1ffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*57+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w6 >> 15) | (w7 << 49) & 0x1ffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*57+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w7 >> 8) | (w8 << 56) & 0x1ffffffffffffff, parm);\ - DST(op,i*64+ 9, (w8 >> 1) & 0x1ffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*57+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w8 >> 58) | (w9 << 6) & 0x1ffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*57+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w9 >> 51) | (w10 << 13) & 0x1ffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*57+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w10 >> 44) | (w11 << 20) & 0x1ffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*57+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w11 >> 37) | (w12 << 27) & 0x1ffffffffffffff, parm); register uint64_t w13 = *(uint64_t 
*)(ip+(i*57+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w12 >> 30) | (w13 << 34) & 0x1ffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*57+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w13 >> 23) | (w14 << 41) & 0x1ffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*57+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w14 >> 16) | (w15 << 48) & 0x1ffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*57+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w15 >> 9) | (w16 << 55) & 0x1ffffffffffffff, parm);\ - DST(op,i*64+18, (w16 >> 2) & 0x1ffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*57+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w16 >> 59) | (w17 << 5) & 0x1ffffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*57+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w17 >> 52) | (w18 << 12) & 0x1ffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*57+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w18 >> 45) | (w19 << 19) & 0x1ffffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*57+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w19 >> 38) | (w20 << 26) & 0x1ffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*57+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w20 >> 31) | (w21 << 33) & 0x1ffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*57+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w21 >> 24) | (w22 << 40) & 0x1ffffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*57+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w22 >> 17) | (w23 << 47) & 0x1ffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*57+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w23 >> 10) | (w24 << 54) & 0x1ffffffffffffff, parm);\ - DST(op,i*64+27, (w24 >> 3) & 0x1ffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*57+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w24 >> 60) | (w25 << 4) & 0x1ffffffffffffff, parm); register uint64_t w26 = *(uint64_t 
*)(ip+(i*57+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w25 >> 53) | (w26 << 11) & 0x1ffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*57+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w26 >> 46) | (w27 << 18) & 0x1ffffffffffffff, parm); register uint64_t w28 = *(uint32_t *)(ip+(i*57+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w27 >> 39) | (w28 << 25) & 0x1ffffffffffffff, parm);;\ -} - -#define BITUNPACK64_57(ip, op, parm) { \ - BITUNBLK64_57(ip, 0, op, parm); DSTI(op); ip += 57*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_58(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*29+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3ffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*29+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 58) | (w1 << 6) & 0x3ffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*29+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w1 >> 52) | (w2 << 12) & 0x3ffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*29+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w2 >> 46) | (w3 << 18) & 0x3ffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*29+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w3 >> 40) | (w4 << 24) & 0x3ffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*29+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w4 >> 34) | (w5 << 30) & 0x3ffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*29+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w5 >> 28) | (w6 << 36) & 0x3ffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*29+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w6 >> 22) | (w7 << 42) & 0x3ffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*29+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w7 >> 16) | (w8 << 48) & 0x3ffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*29+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w8 >> 10) | (w9 << 54) & 0x3ffffffffffffff, parm);\ - DST(op,i*32+10, (w9 >> 4) & 
0x3ffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*29+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w9 >> 62) | (w10 << 2) & 0x3ffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*29+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w10 >> 56) | (w11 << 8) & 0x3ffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*29+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w11 >> 50) | (w12 << 14) & 0x3ffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*29+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w12 >> 44) | (w13 << 20) & 0x3ffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*29+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w13 >> 38) | (w14 << 26) & 0x3ffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*29+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w14 >> 32) | (w15 << 32) & 0x3ffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*29+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w15 >> 26) | (w16 << 38) & 0x3ffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*29+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w16 >> 20) | (w17 << 44) & 0x3ffffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*29+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w17 >> 14) | (w18 << 50) & 0x3ffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*29+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w18 >> 8) | (w19 << 56) & 0x3ffffffffffffff, parm);\ - DST(op,i*32+21, (w19 >> 2) & 0x3ffffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*29+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w19 >> 60) | (w20 << 4) & 0x3ffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*29+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w20 >> 54) | (w21 << 10) & 0x3ffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*29+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w21 >> 48) | (w22 << 16) & 0x3ffffffffffffff, parm); register uint64_t w23 = *(uint64_t 
*)(ip+(i*29+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w22 >> 42) | (w23 << 22) & 0x3ffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*29+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w23 >> 36) | (w24 << 28) & 0x3ffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*29+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w24 >> 30) | (w25 << 34) & 0x3ffffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*29+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w25 >> 24) | (w26 << 40) & 0x3ffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*29+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w26 >> 18) | (w27 << 46) & 0x3ffffffffffffff, parm); register uint64_t w28 = *(uint64_t *)(ip+(i*29+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w27 >> 12) | (w28 << 52) & 0x3ffffffffffffff, parm);\ - DST(op,i*32+31, (w28 >> 6) , parm);;\ -} - -#define BITUNPACK64_58(ip, op, parm) { \ - BITUNBLK64_58(ip, 0, op, parm); DSTI(op); ip += 58*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_59(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*59+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7ffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*59+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 59) | (w1 << 5) & 0x7ffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*59+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 54) | (w2 << 10) & 0x7ffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*59+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 49) | (w3 << 15) & 0x7ffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*59+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w3 >> 44) | (w4 << 20) & 0x7ffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*59+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w4 >> 39) | (w5 << 25) & 0x7ffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*59+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w5 >> 34) | (w6 << 30) & 0x7ffffffffffffff, 
parm); register uint64_t w7 = *(uint64_t *)(ip+(i*59+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w6 >> 29) | (w7 << 35) & 0x7ffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*59+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w7 >> 24) | (w8 << 40) & 0x7ffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*59+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w8 >> 19) | (w9 << 45) & 0x7ffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*59+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w9 >> 14) | (w10 << 50) & 0x7ffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*59+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w10 >> 9) | (w11 << 55) & 0x7ffffffffffffff, parm);\ - DST(op,i*64+12, (w11 >> 4) & 0x7ffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*59+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w11 >> 63) | (w12 << 1) & 0x7ffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*59+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w12 >> 58) | (w13 << 6) & 0x7ffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*59+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w13 >> 53) | (w14 << 11) & 0x7ffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*59+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w14 >> 48) | (w15 << 16) & 0x7ffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*59+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w15 >> 43) | (w16 << 21) & 0x7ffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*59+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w16 >> 38) | (w17 << 26) & 0x7ffffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*59+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w17 >> 33) | (w18 << 31) & 0x7ffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*59+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w18 >> 28) | (w19 << 36) & 0x7ffffffffffffff, parm); register uint64_t w20 = *(uint64_t 
*)(ip+(i*59+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w19 >> 23) | (w20 << 41) & 0x7ffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*59+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w20 >> 18) | (w21 << 46) & 0x7ffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*59+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w21 >> 13) | (w22 << 51) & 0x7ffffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*59+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w22 >> 8) | (w23 << 56) & 0x7ffffffffffffff, parm);\ - DST(op,i*64+25, (w23 >> 3) & 0x7ffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*59+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w23 >> 62) | (w24 << 2) & 0x7ffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*59+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w24 >> 57) | (w25 << 7) & 0x7ffffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*59+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w25 >> 52) | (w26 << 12) & 0x7ffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*59+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w26 >> 47) | (w27 << 17) & 0x7ffffffffffffff, parm); register uint64_t w28 = *(uint64_t *)(ip+(i*59+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w27 >> 42) | (w28 << 22) & 0x7ffffffffffffff, parm); register uint64_t w29 = *(uint32_t *)(ip+(i*59+29)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w28 >> 37) | (w29 << 27) & 0x7ffffffffffffff, parm);;\ -} - -#define BITUNPACK64_59(ip, op, parm) { \ - BITUNBLK64_59(ip, 0, op, parm); DSTI(op); ip += 59*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_60(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*15+0)*8/sizeof(ip[0]));\ - DST(op,i*16+ 0, (w0 ) & 0xfffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*15+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 1, (w0 >> 60) | (w1 << 4) & 0xfffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*15+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 2, (w1 >> 56) | 
(w2 << 8) & 0xfffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*15+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 3, (w2 >> 52) | (w3 << 12) & 0xfffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*15+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 4, (w3 >> 48) | (w4 << 16) & 0xfffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*15+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 5, (w4 >> 44) | (w5 << 20) & 0xfffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*15+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 6, (w5 >> 40) | (w6 << 24) & 0xfffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*15+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 7, (w6 >> 36) | (w7 << 28) & 0xfffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*15+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 8, (w7 >> 32) | (w8 << 32) & 0xfffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*15+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+ 9, (w8 >> 28) | (w9 << 36) & 0xfffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*15+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+10, (w9 >> 24) | (w10 << 40) & 0xfffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*15+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+11, (w10 >> 20) | (w11 << 44) & 0xfffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*15+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+12, (w11 >> 16) | (w12 << 48) & 0xfffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*15+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+13, (w12 >> 12) | (w13 << 52) & 0xfffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*15+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*16+14, (w13 >> 8) | (w14 << 56) & 0xfffffffffffffff, parm);\ - DST(op,i*16+15, (w14 >> 4) , parm);;\ -} - -#define BITUNPACK64_60(ip, op, parm) { \ - BITUNBLK64_60(ip, 0, op, parm);\ - BITUNBLK64_60(ip, 1, op, parm); DSTI(op); ip += 60*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_61(ip, i, op, parm) { register 
uint64_t w0 = *(uint64_t *)(ip+(i*61+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x1fffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*61+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 61) | (w1 << 3) & 0x1fffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*61+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 58) | (w2 << 6) & 0x1fffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*61+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 55) | (w3 << 9) & 0x1fffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*61+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 4, (w3 >> 52) | (w4 << 12) & 0x1fffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*61+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w4 >> 49) | (w5 << 15) & 0x1fffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*61+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w5 >> 46) | (w6 << 18) & 0x1fffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*61+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w6 >> 43) | (w7 << 21) & 0x1fffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*61+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w7 >> 40) | (w8 << 24) & 0x1fffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*61+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w8 >> 37) | (w9 << 27) & 0x1fffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*61+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w9 >> 34) | (w10 << 30) & 0x1fffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*61+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w10 >> 31) | (w11 << 33) & 0x1fffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*61+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w11 >> 28) | (w12 << 36) & 0x1fffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*61+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w12 >> 25) | (w13 << 39) & 0x1fffffffffffffff, parm); register uint64_t w14 = 
*(uint64_t *)(ip+(i*61+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w13 >> 22) | (w14 << 42) & 0x1fffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*61+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w14 >> 19) | (w15 << 45) & 0x1fffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*61+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w15 >> 16) | (w16 << 48) & 0x1fffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*61+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w16 >> 13) | (w17 << 51) & 0x1fffffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*61+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w17 >> 10) | (w18 << 54) & 0x1fffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*61+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w18 >> 7) | (w19 << 57) & 0x1fffffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*61+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w19 >> 4) | (w20 << 60) & 0x1fffffffffffffff, parm);\ - DST(op,i*64+21, (w20 >> 1) & 0x1fffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*61+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w20 >> 62) | (w21 << 2) & 0x1fffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*61+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w21 >> 59) | (w22 << 5) & 0x1fffffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*61+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w22 >> 56) | (w23 << 8) & 0x1fffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*61+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w23 >> 53) | (w24 << 11) & 0x1fffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*61+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w24 >> 50) | (w25 << 14) & 0x1fffffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*61+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w25 >> 47) | (w26 << 17) & 0x1fffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*61+27)*8/sizeof(ip[0]));\ 
-\ - DST(op,i*64+28, (w26 >> 44) | (w27 << 20) & 0x1fffffffffffffff, parm); register uint64_t w28 = *(uint64_t *)(ip+(i*61+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w27 >> 41) | (w28 << 23) & 0x1fffffffffffffff, parm); register uint64_t w29 = *(uint64_t *)(ip+(i*61+29)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w28 >> 38) | (w29 << 26) & 0x1fffffffffffffff, parm); register uint64_t w30 = *(uint32_t *)(ip+(i*61+30)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w29 >> 35) | (w30 << 29) & 0x1fffffffffffffff, parm);;\ -} - -#define BITUNPACK64_61(ip, op, parm) { \ - BITUNBLK64_61(ip, 0, op, parm); DSTI(op); ip += 61*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_62(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*31+0)*8/sizeof(ip[0]));\ - DST(op,i*32+ 0, (w0 ) & 0x3fffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*31+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 1, (w0 >> 62) | (w1 << 2) & 0x3fffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*31+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 2, (w1 >> 60) | (w2 << 4) & 0x3fffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*31+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 3, (w2 >> 58) | (w3 << 6) & 0x3fffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*31+4)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 4, (w3 >> 56) | (w4 << 8) & 0x3fffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*31+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 5, (w4 >> 54) | (w5 << 10) & 0x3fffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*31+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 6, (w5 >> 52) | (w6 << 12) & 0x3fffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*31+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 7, (w6 >> 50) | (w7 << 14) & 0x3fffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*31+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 8, (w7 >> 48) | (w8 << 16) & 0x3fffffffffffffff, parm); register uint64_t w9 = *(uint64_t 
*)(ip+(i*31+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+ 9, (w8 >> 46) | (w9 << 18) & 0x3fffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*31+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+10, (w9 >> 44) | (w10 << 20) & 0x3fffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*31+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+11, (w10 >> 42) | (w11 << 22) & 0x3fffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*31+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+12, (w11 >> 40) | (w12 << 24) & 0x3fffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*31+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+13, (w12 >> 38) | (w13 << 26) & 0x3fffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*31+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+14, (w13 >> 36) | (w14 << 28) & 0x3fffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*31+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+15, (w14 >> 34) | (w15 << 30) & 0x3fffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*31+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+16, (w15 >> 32) | (w16 << 32) & 0x3fffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*31+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+17, (w16 >> 30) | (w17 << 34) & 0x3fffffffffffffff, parm); register uint64_t w18 = *(uint64_t *)(ip+(i*31+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+18, (w17 >> 28) | (w18 << 36) & 0x3fffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*31+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+19, (w18 >> 26) | (w19 << 38) & 0x3fffffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*31+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+20, (w19 >> 24) | (w20 << 40) & 0x3fffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*31+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+21, (w20 >> 22) | (w21 << 42) & 0x3fffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*31+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+22, (w21 >> 20) | (w22 << 44) & 0x3fffffffffffffff, 
parm); register uint64_t w23 = *(uint64_t *)(ip+(i*31+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+23, (w22 >> 18) | (w23 << 46) & 0x3fffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*31+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+24, (w23 >> 16) | (w24 << 48) & 0x3fffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*31+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+25, (w24 >> 14) | (w25 << 50) & 0x3fffffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*31+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+26, (w25 >> 12) | (w26 << 52) & 0x3fffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*31+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+27, (w26 >> 10) | (w27 << 54) & 0x3fffffffffffffff, parm); register uint64_t w28 = *(uint64_t *)(ip+(i*31+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+28, (w27 >> 8) | (w28 << 56) & 0x3fffffffffffffff, parm); register uint64_t w29 = *(uint64_t *)(ip+(i*31+29)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+29, (w28 >> 6) | (w29 << 58) & 0x3fffffffffffffff, parm); register uint64_t w30 = *(uint64_t *)(ip+(i*31+30)*8/sizeof(ip[0]));\ -\ - DST(op,i*32+30, (w29 >> 4) | (w30 << 60) & 0x3fffffffffffffff, parm);\ - DST(op,i*32+31, (w30 >> 2) , parm);;\ -} - -#define BITUNPACK64_62(ip, op, parm) { \ - BITUNBLK64_62(ip, 0, op, parm); DSTI(op); ip += 62*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_63(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*63+0)*8/sizeof(ip[0]));\ - DST(op,i*64+ 0, (w0 ) & 0x7fffffffffffffff, parm); register uint64_t w1 = *(uint64_t *)(ip+(i*63+1)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 1, (w0 >> 63) | (w1 << 1) & 0x7fffffffffffffff, parm); register uint64_t w2 = *(uint64_t *)(ip+(i*63+2)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 2, (w1 >> 62) | (w2 << 2) & 0x7fffffffffffffff, parm); register uint64_t w3 = *(uint64_t *)(ip+(i*63+3)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 3, (w2 >> 61) | (w3 << 3) & 0x7fffffffffffffff, parm); register uint64_t w4 = *(uint64_t *)(ip+(i*63+4)*8/sizeof(ip[0]));\ -\ - 
DST(op,i*64+ 4, (w3 >> 60) | (w4 << 4) & 0x7fffffffffffffff, parm); register uint64_t w5 = *(uint64_t *)(ip+(i*63+5)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 5, (w4 >> 59) | (w5 << 5) & 0x7fffffffffffffff, parm); register uint64_t w6 = *(uint64_t *)(ip+(i*63+6)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 6, (w5 >> 58) | (w6 << 6) & 0x7fffffffffffffff, parm); register uint64_t w7 = *(uint64_t *)(ip+(i*63+7)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 7, (w6 >> 57) | (w7 << 7) & 0x7fffffffffffffff, parm); register uint64_t w8 = *(uint64_t *)(ip+(i*63+8)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 8, (w7 >> 56) | (w8 << 8) & 0x7fffffffffffffff, parm); register uint64_t w9 = *(uint64_t *)(ip+(i*63+9)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+ 9, (w8 >> 55) | (w9 << 9) & 0x7fffffffffffffff, parm); register uint64_t w10 = *(uint64_t *)(ip+(i*63+10)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+10, (w9 >> 54) | (w10 << 10) & 0x7fffffffffffffff, parm); register uint64_t w11 = *(uint64_t *)(ip+(i*63+11)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+11, (w10 >> 53) | (w11 << 11) & 0x7fffffffffffffff, parm); register uint64_t w12 = *(uint64_t *)(ip+(i*63+12)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+12, (w11 >> 52) | (w12 << 12) & 0x7fffffffffffffff, parm); register uint64_t w13 = *(uint64_t *)(ip+(i*63+13)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+13, (w12 >> 51) | (w13 << 13) & 0x7fffffffffffffff, parm); register uint64_t w14 = *(uint64_t *)(ip+(i*63+14)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+14, (w13 >> 50) | (w14 << 14) & 0x7fffffffffffffff, parm); register uint64_t w15 = *(uint64_t *)(ip+(i*63+15)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+15, (w14 >> 49) | (w15 << 15) & 0x7fffffffffffffff, parm); register uint64_t w16 = *(uint64_t *)(ip+(i*63+16)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+16, (w15 >> 48) | (w16 << 16) & 0x7fffffffffffffff, parm); register uint64_t w17 = *(uint64_t *)(ip+(i*63+17)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+17, (w16 >> 47) | (w17 << 17) & 0x7fffffffffffffff, parm); register uint64_t w18 = *(uint64_t 
*)(ip+(i*63+18)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+18, (w17 >> 46) | (w18 << 18) & 0x7fffffffffffffff, parm); register uint64_t w19 = *(uint64_t *)(ip+(i*63+19)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+19, (w18 >> 45) | (w19 << 19) & 0x7fffffffffffffff, parm); register uint64_t w20 = *(uint64_t *)(ip+(i*63+20)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+20, (w19 >> 44) | (w20 << 20) & 0x7fffffffffffffff, parm); register uint64_t w21 = *(uint64_t *)(ip+(i*63+21)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+21, (w20 >> 43) | (w21 << 21) & 0x7fffffffffffffff, parm); register uint64_t w22 = *(uint64_t *)(ip+(i*63+22)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+22, (w21 >> 42) | (w22 << 22) & 0x7fffffffffffffff, parm); register uint64_t w23 = *(uint64_t *)(ip+(i*63+23)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+23, (w22 >> 41) | (w23 << 23) & 0x7fffffffffffffff, parm); register uint64_t w24 = *(uint64_t *)(ip+(i*63+24)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+24, (w23 >> 40) | (w24 << 24) & 0x7fffffffffffffff, parm); register uint64_t w25 = *(uint64_t *)(ip+(i*63+25)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+25, (w24 >> 39) | (w25 << 25) & 0x7fffffffffffffff, parm); register uint64_t w26 = *(uint64_t *)(ip+(i*63+26)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+26, (w25 >> 38) | (w26 << 26) & 0x7fffffffffffffff, parm); register uint64_t w27 = *(uint64_t *)(ip+(i*63+27)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+27, (w26 >> 37) | (w27 << 27) & 0x7fffffffffffffff, parm); register uint64_t w28 = *(uint64_t *)(ip+(i*63+28)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+28, (w27 >> 36) | (w28 << 28) & 0x7fffffffffffffff, parm); register uint64_t w29 = *(uint64_t *)(ip+(i*63+29)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+29, (w28 >> 35) | (w29 << 29) & 0x7fffffffffffffff, parm); register uint64_t w30 = *(uint64_t *)(ip+(i*63+30)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+30, (w29 >> 34) | (w30 << 30) & 0x7fffffffffffffff, parm); register uint64_t w31 = *(uint32_t *)(ip+(i*63+31)*8/sizeof(ip[0]));\ -\ - DST(op,i*64+31, (w30 >> 33) | (w31 << 31) & 
0x7fffffffffffffff, parm);;\ -} - -#define BITUNPACK64_63(ip, op, parm) { \ - BITUNBLK64_63(ip, 0, op, parm); DSTI(op); ip += 63*4/sizeof(ip[0]);\ -} - -#define BITUNBLK64_64(ip, i, op, parm) { register uint64_t w0 = *(uint64_t *)(ip+(i*1+0)*8/sizeof(ip[0]));\ - DST(op,i*1+ 0, (w0 ) , parm);;\ -} - -#define BITUNPACK64_64(ip, op, parm) { \ - BITUNBLK64_64(ip, 0, op, parm);\ - BITUNBLK64_64(ip, 1, op, parm);\ - BITUNBLK64_64(ip, 2, op, parm);\ - BITUNBLK64_64(ip, 3, op, parm);\ - BITUNBLK64_64(ip, 4, op, parm);\ - BITUNBLK64_64(ip, 5, op, parm);\ - BITUNBLK64_64(ip, 6, op, parm);\ - BITUNBLK64_64(ip, 7, op, parm);\ - BITUNBLK64_64(ip, 8, op, parm);\ - BITUNBLK64_64(ip, 9, op, parm);\ - BITUNBLK64_64(ip, 10, op, parm);\ - BITUNBLK64_64(ip, 11, op, parm);\ - BITUNBLK64_64(ip, 12, op, parm);\ - BITUNBLK64_64(ip, 13, op, parm);\ - BITUNBLK64_64(ip, 14, op, parm);\ - BITUNBLK64_64(ip, 15, op, parm);\ - BITUNBLK64_64(ip, 16, op, parm);\ - BITUNBLK64_64(ip, 17, op, parm);\ - BITUNBLK64_64(ip, 18, op, parm);\ - BITUNBLK64_64(ip, 19, op, parm);\ - BITUNBLK64_64(ip, 20, op, parm);\ - BITUNBLK64_64(ip, 21, op, parm);\ - BITUNBLK64_64(ip, 22, op, parm);\ - BITUNBLK64_64(ip, 23, op, parm);\ - BITUNBLK64_64(ip, 24, op, parm);\ - BITUNBLK64_64(ip, 25, op, parm);\ - BITUNBLK64_64(ip, 26, op, parm);\ - BITUNBLK64_64(ip, 27, op, parm);\ - BITUNBLK64_64(ip, 28, op, parm);\ - BITUNBLK64_64(ip, 29, op, parm);\ - BITUNBLK64_64(ip, 30, op, parm);\ - BITUNBLK64_64(ip, 31, op, parm); DSTI(op); ip += 64*4/sizeof(ip[0]);\ -} - diff --git a/bitunpackv.c b/bitunpackv.c deleted file mode 100644 index d03ffdf..0000000 --- a/bitunpackv.c +++ /dev/null @@ -1,637 +0,0 @@ -/** - Copyright (C) powturbo 2013-2017 - GPL v2 License - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - - - homepage : https://sites.google.com/site/powturbo/ - - github : https://github.com/powturbo - - twitter : https://twitter.com/powturbo - - email : powturbo [_AT_] gmail [_DOT_] com -**/ -// "Integer Compression" SIMD Bit Packing -#include -#include -#include "conf.h" -#include "bitutil.h" -#include "bitpack.h" - -#define PAD8(__x) (((__x)+7)/8) - -//----------------------------------------------------------------------------- -#define VSTO( _op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, ov) -#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) -#include "bitunpack128v_.h" - -#define BITUNBLK128V32_0(ip, _i_, _op_, _parm_) {__m128i ov;\ - VSTO0(_op_, 0, ov, _parm_);\ - VSTO0(_op_, 1, ov, _parm_);\ - VSTO0(_op_, 2, ov, _parm_);\ - VSTO0(_op_, 3, ov, _parm_);\ - VSTO0(_op_, 4, ov, _parm_);\ - VSTO0(_op_, 5, ov, _parm_);\ - VSTO0(_op_, 6, ov, _parm_);\ - VSTO0(_op_, 7, ov, _parm_);\ - VSTO0(_op_, 8, ov, _parm_);\ - VSTO0(_op_, 9, ov, _parm_);\ - VSTO0(_op_, 10, ov, _parm_);\ - VSTO0(_op_, 11, ov, _parm_);\ - VSTO0(_op_, 12, ov, _parm_);\ - VSTO0(_op_, 13, ov, _parm_);\ - VSTO0(_op_, 14, ov, _parm_);\ - VSTO0(_op_, 15, ov, _parm_);\ - VSTO0(_op_, 16, ov, _parm_);\ - VSTO0(_op_, 17, ov, _parm_);\ - VSTO0(_op_, 18, ov, _parm_);\ - VSTO0(_op_, 19, ov, _parm_);\ - VSTO0(_op_, 20, ov, _parm_);\ - VSTO0(_op_, 21, ov, _parm_);\ - VSTO0(_op_, 22, ov, _parm_);\ - VSTO0(_op_, 23, ov, _parm_);\ - VSTO0(_op_, 24, ov, _parm_);\ - VSTO0(_op_, 25, ov, _parm_);\ - VSTO0(_op_, 26, ov, _parm_);\ - VSTO0(_op_, 27, ov, 
_parm_);\ - VSTO0(_op_, 28, ov, _parm_);\ - VSTO0(_op_, 29, ov, _parm_);\ - VSTO0(_op_, 30, ov, _parm_);\ - VSTO0(_op_, 31, ov, _parm_);\ -} -#define BITUNPACK0(_parm_) _parm_ = _mm_setzero_si128() - -unsigned char *bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); - __m128i sv; - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//----------------------------------------------------------------------------- - #ifdef __SSSE3__ -#include -static ALIGNED(char, shuffles[16][16], 16) = { - #define _ 0x80 - { _,_,_,_, _,_,_,_, _,_, _, _, _, _, _,_ }, - { 0,1,2,3, _,_,_,_, _,_, _, _, _, _, _,_ }, - { _,_,_,_, 0,1,2,3, _,_, _, _, _, _, _,_ }, - { 0,1,2,3, 4,5,6,7, _,_, _, _, _, _, _,_ }, - { _,_,_,_, _,_,_,_, 0,1, 2, 3, _, _, _,_ }, - { 0,1,2,3, _,_,_,_, 4,5, 6, 7, _, _, _,_ }, - { _,_,_,_, 0,1,2,3, 4,5, 6, 7, _, _, _,_ }, - { 0,1,2,3, 4,5,6,7, 8,9,10,11, _, _, _,_ }, - { _,_,_,_, _,_,_,_, _,_,_,_, 0, 1, 2, 3 }, - { 0,1,2,3, _,_,_,_, _,_,_, _, 4, 5, 6, 7 }, - { _,_,_,_, 0,1,2,3, _,_,_, _, 4, 5, 6, 7 }, - { 0,1,2,3, 4,5,6,7, _,_, _, _, 8, 9,10,11 }, - { _,_,_,_, _,_,_,_, 0,1, 2, 3, 4, 5, 6, 7 }, - { 0,1,2,3, _,_,_,_, 4,5, 6, 7, 8, 9,10,11 }, - { _,_,_,_, 0,1,2,3, 4,5, 6, 7, 8, 9,10,11 }, - { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, - #undef _ -}; - -#define VSTO( _op_, _i_, _ov_, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) )); pex += popcnt32(m) -#define VSTO0(_op_, _i_, ov, _parm_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _mm_storeu_si128(_op_++, _mm_shuffle_epi8( _mm_loadu_si128((__m128i*)pex), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) -#define BITUNPACK0(_parm_) //_parm_ = _mm_setzero_si128() -#include 
"bitunpack128v_.h" - -unsigned char *_bitunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; - __m128i sv; - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - #endif - -//----------------------------------------------------------------------------- -#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_) -#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG128x32(__ov); SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) -#include "bitunpack128v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *bitzunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); - __m128i sv = _mm_set1_epi32(start); - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCAN128x32(__ov,__sv); _mm_storeu_si128(__op++, __sv) -#include "bitunpack128v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); - __m128i sv = _mm_set1_epi32(start); - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- - #ifdef __SSSE3__ -#define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); 
SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); - -#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCAN128x32(_ov_,_sv_); _mm_storeu_si128(_op_++, _sv_); - -#include "bitunpack128v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *_bitdunpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; - __m128i sv = _mm_set1_epi32(start); - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - #endif -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCANI128x32(__ov,__sv,cv); _mm_storeu_si128(__op++, __sv); -#define VSTO0(_op_, _i_, ov, _parm_) _mm_storeu_si128(_op_++, _parm_); _parm_ = _mm_add_epi32(_parm_, cv) -#include "bitunpack128v_.h" - -#define BITUNPACK0(_parm_) _parm_ = _mm_add_epi32(_parm_, cv); cv = _mm_set1_epi32(4) - -unsigned char *bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(128*b); - __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//----------------------------------------------------------------------------- - #ifdef __SSSE3__ -#define VEXP(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_add_epi32(_ov_, _mm_shuffle_epi8(_mm_slli_epi32(_mm_loadu_si128((__m128i*)pex), b), _mm_load_si128((__m128i*)shuffles[m]) ) ); pex += popcnt32(m) -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); 
SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); - -#define VEXP0(_i_, _ov_) if(!((_i_) & 1)) m = (*bb) & 0xf;else m = (*bb++) >> 4; _ov_ = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pex),_mm_load_si128((__m128i*)shuffles[m]) ); pex += popcnt32(m) -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0( _i_, _ov_); SCANI128x32(_ov_,_sv_,cv); _mm_storeu_si128(_op_++, _sv_); - -#include "bitunpack128v_.h" - -#define BITUNPACK0(_parm_) mv = _mm_set1_epi32(0) //_parm_ = _mm_setzero_si128() - -unsigned char *_bitd1unpack128v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(128*b); unsigned m; - __m128i sv = _mm_set1_epi32(start), cv = _mm_set_epi32(4,3,2,1); - BITUNPACK128V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - #endif - - #ifdef __AVX2__ -#include - - #ifdef __AVX512F__ -#define mm256_maskz_expand_epi32(_m_,_v_) _mm256_maskz_expand_epi32(_m_,_v_) -#define mm256_maskz_loadu_epi32( _m_,_v_) _mm256_maskz_loadu_epi32( _m_,_v_) - #else -static unsigned char permv[256][8] __attribute__((aligned(32))) = { -0,0,0,0,0,0,0,0, -0,1,1,1,1,1,1,1, -1,0,1,1,1,1,1,1, -0,1,2,2,2,2,2,2, -1,1,0,1,1,1,1,1, -0,2,1,2,2,2,2,2, -2,0,1,2,2,2,2,2, -0,1,2,3,3,3,3,3, -1,1,1,0,1,1,1,1, -0,2,2,1,2,2,2,2, -2,0,2,1,2,2,2,2, -0,1,3,2,3,3,3,3, -2,2,0,1,2,2,2,2, -0,3,1,2,3,3,3,3, -3,0,1,2,3,3,3,3, -0,1,2,3,4,4,4,4, -1,1,1,1,0,1,1,1, -0,2,2,2,1,2,2,2, -2,0,2,2,1,2,2,2, -0,1,3,3,2,3,3,3, -2,2,0,2,1,2,2,2, -0,3,1,3,2,3,3,3, -3,0,1,3,2,3,3,3, -0,1,2,4,3,4,4,4, -2,2,2,0,1,2,2,2, -0,3,3,1,2,3,3,3, -3,0,3,1,2,3,3,3, -0,1,4,2,3,4,4,4, -3,3,0,1,2,3,3,3, -0,4,1,2,3,4,4,4, -4,0,1,2,3,4,4,4, -0,1,2,3,4,5,5,5, -1,1,1,1,1,0,1,1, -0,2,2,2,2,1,2,2, -2,0,2,2,2,1,2,2, -0,1,3,3,3,2,3,3, -2,2,0,2,2,1,2,2, -0,3,1,3,3,2,3,3, -3,0,1,3,3,2,3,3, -0,1,2,4,4,3,4,4, -2,2,2,0,2,1,2,2, -0,3,3,1,3,2,3,3, -3,0,3,1,3,2,3,3, -0,1,4,2,4,3,4,4, 
-3,3,0,1,3,2,3,3, -0,4,1,2,4,3,4,4, -4,0,1,2,4,3,4,4, -0,1,2,3,5,4,5,5, -2,2,2,2,0,1,2,2, -0,3,3,3,1,2,3,3, -3,0,3,3,1,2,3,3, -0,1,4,4,2,3,4,4, -3,3,0,3,1,2,3,3, -0,4,1,4,2,3,4,4, -4,0,1,4,2,3,4,4, -0,1,2,5,3,4,5,5, -3,3,3,0,1,2,3,3, -0,4,4,1,2,3,4,4, -4,0,4,1,2,3,4,4, -0,1,5,2,3,4,5,5, -4,4,0,1,2,3,4,4, -0,5,1,2,3,4,5,5, -5,0,1,2,3,4,5,5, -0,1,2,3,4,5,6,6, -1,1,1,1,1,1,0,1, -0,2,2,2,2,2,1,2, -2,0,2,2,2,2,1,2, -0,1,3,3,3,3,2,3, -2,2,0,2,2,2,1,2, -0,3,1,3,3,3,2,3, -3,0,1,3,3,3,2,3, -0,1,2,4,4,4,3,4, -2,2,2,0,2,2,1,2, -0,3,3,1,3,3,2,3, -3,0,3,1,3,3,2,3, -0,1,4,2,4,4,3,4, -3,3,0,1,3,3,2,3, -0,4,1,2,4,4,3,4, -4,0,1,2,4,4,3,4, -0,1,2,3,5,5,4,5, -2,2,2,2,0,2,1,2, -0,3,3,3,1,3,2,3, -3,0,3,3,1,3,2,3, -0,1,4,4,2,4,3,4, -3,3,0,3,1,3,2,3, -0,4,1,4,2,4,3,4, -4,0,1,4,2,4,3,4, -0,1,2,5,3,5,4,5, -3,3,3,0,1,3,2,3, -0,4,4,1,2,4,3,4, -4,0,4,1,2,4,3,4, -0,1,5,2,3,5,4,5, -4,4,0,1,2,4,3,4, -0,5,1,2,3,5,4,5, -5,0,1,2,3,5,4,5, -0,1,2,3,4,6,5,6, -2,2,2,2,2,0,1,2, -0,3,3,3,3,1,2,3, -3,0,3,3,3,1,2,3, -0,1,4,4,4,2,3,4, -3,3,0,3,3,1,2,3, -0,4,1,4,4,2,3,4, -4,0,1,4,4,2,3,4, -0,1,2,5,5,3,4,5, -3,3,3,0,3,1,2,3, -0,4,4,1,4,2,3,4, -4,0,4,1,4,2,3,4, -0,1,5,2,5,3,4,5, -4,4,0,1,4,2,3,4, -0,5,1,2,5,3,4,5, -5,0,1,2,5,3,4,5, -0,1,2,3,6,4,5,6, -3,3,3,3,0,1,2,3, -0,4,4,4,1,2,3,4, -4,0,4,4,1,2,3,4, -0,1,5,5,2,3,4,5, -4,4,0,4,1,2,3,4, -0,5,1,5,2,3,4,5, -5,0,1,5,2,3,4,5, -0,1,2,6,3,4,5,6, -4,4,4,0,1,2,3,4, -0,5,5,1,2,3,4,5, -5,0,5,1,2,3,4,5, -0,1,6,2,3,4,5,6, -5,5,0,1,2,3,4,5, -0,6,1,2,3,4,5,6, -6,0,1,2,3,4,5,6, -0,1,2,3,4,5,6,7, -1,1,1,1,1,1,1,0, -0,2,2,2,2,2,2,1, -2,0,2,2,2,2,2,1, -0,1,3,3,3,3,3,2, -2,2,0,2,2,2,2,1, -0,3,1,3,3,3,3,2, -3,0,1,3,3,3,3,2, -0,1,2,4,4,4,4,3, -2,2,2,0,2,2,2,1, -0,3,3,1,3,3,3,2, -3,0,3,1,3,3,3,2, -0,1,4,2,4,4,4,3, -3,3,0,1,3,3,3,2, -0,4,1,2,4,4,4,3, -4,0,1,2,4,4,4,3, -0,1,2,3,5,5,5,4, -2,2,2,2,0,2,2,1, -0,3,3,3,1,3,3,2, -3,0,3,3,1,3,3,2, -0,1,4,4,2,4,4,3, -3,3,0,3,1,3,3,2, -0,4,1,4,2,4,4,3, -4,0,1,4,2,4,4,3, -0,1,2,5,3,5,5,4, -3,3,3,0,1,3,3,2, -0,4,4,1,2,4,4,3, -4,0,4,1,2,4,4,3, 
-0,1,5,2,3,5,5,4, -4,4,0,1,2,4,4,3, -0,5,1,2,3,5,5,4, -5,0,1,2,3,5,5,4, -0,1,2,3,4,6,6,5, -2,2,2,2,2,0,2,1, -0,3,3,3,3,1,3,2, -3,0,3,3,3,1,3,2, -0,1,4,4,4,2,4,3, -3,3,0,3,3,1,3,2, -0,4,1,4,4,2,4,3, -4,0,1,4,4,2,4,3, -0,1,2,5,5,3,5,4, -3,3,3,0,3,1,3,2, -0,4,4,1,4,2,4,3, -4,0,4,1,4,2,4,3, -0,1,5,2,5,3,5,4, -4,4,0,1,4,2,4,3, -0,5,1,2,5,3,5,4, -5,0,1,2,5,3,5,4, -0,1,2,3,6,4,6,5, -3,3,3,3,0,1,3,2, -0,4,4,4,1,2,4,3, -4,0,4,4,1,2,4,3, -0,1,5,5,2,3,5,4, -4,4,0,4,1,2,4,3, -0,5,1,5,2,3,5,4, -5,0,1,5,2,3,5,4, -0,1,2,6,3,4,6,5, -4,4,4,0,1,2,4,3, -0,5,5,1,2,3,5,4, -5,0,5,1,2,3,5,4, -0,1,6,2,3,4,6,5, -5,5,0,1,2,3,5,4, -0,6,1,2,3,4,6,5, -6,0,1,2,3,4,6,5, -0,1,2,3,4,5,7,6, -2,2,2,2,2,2,0,1, -0,3,3,3,3,3,1,2, -3,0,3,3,3,3,1,2, -0,1,4,4,4,4,2,3, -3,3,0,3,3,3,1,2, -0,4,1,4,4,4,2,3, -4,0,1,4,4,4,2,3, -0,1,2,5,5,5,3,4, -3,3,3,0,3,3,1,2, -0,4,4,1,4,4,2,3, -4,0,4,1,4,4,2,3, -0,1,5,2,5,5,3,4, -4,4,0,1,4,4,2,3, -0,5,1,2,5,5,3,4, -5,0,1,2,5,5,3,4, -0,1,2,3,6,6,4,5, -3,3,3,3,0,3,1,2, -0,4,4,4,1,4,2,3, -4,0,4,4,1,4,2,3, -0,1,5,5,2,5,3,4, -4,4,0,4,1,4,2,3, -0,5,1,5,2,5,3,4, -5,0,1,5,2,5,3,4, -0,1,2,6,3,6,4,5, -4,4,4,0,1,4,2,3, -0,5,5,1,2,5,3,4, -5,0,5,1,2,5,3,4, -0,1,6,2,3,6,4,5, -5,5,0,1,2,5,3,4, -0,6,1,2,3,6,4,5, -6,0,1,2,3,6,4,5, -0,1,2,3,4,7,5,6, -3,3,3,3,3,0,1,2, -0,4,4,4,4,1,2,3, -4,0,4,4,4,1,2,3, -0,1,5,5,5,2,3,4, -4,4,0,4,4,1,2,3, -0,5,1,5,5,2,3,4, -5,0,1,5,5,2,3,4, -0,1,2,6,6,3,4,5, -4,4,4,0,4,1,2,3, -0,5,5,1,5,2,3,4, -5,0,5,1,5,2,3,4, -0,1,6,2,6,3,4,5, -5,5,0,1,5,2,3,4, -0,6,1,2,6,3,4,5, -6,0,1,2,6,3,4,5, -0,1,2,3,7,4,5,6, -4,4,4,4,0,1,2,3, -0,5,5,5,1,2,3,4, -5,0,5,5,1,2,3,4, -0,1,6,6,2,3,4,5, -5,5,0,5,1,2,3,4, -0,6,1,6,2,3,4,5, -6,0,1,6,2,3,4,5, -0,1,2,7,3,4,5,6, -5,5,5,0,1,2,3,4, -0,6,6,1,2,3,4,5, -6,0,6,1,2,3,4,5, -0,1,7,2,3,4,5,6, -6,6,0,1,2,3,4,5, -0,7,1,2,3,4,5,6, -7,0,1,2,3,4,5,6, -0,1,2,3,4,5,6,7 -}; -#define u2vmask(_m_,_tv_) _mm256_sllv_epi32(_mm256_set1_epi8(_m_), _tv_) -#define mm256_maskz_expand_epi32(_m_, _v_) _mm256_permutevar8x32_epi32(_v_, 
_mm256_cvtepu8_epi32(_mm_cvtsi64_si128(ctou64(permv[_m_]))) ) -#define mm256_maskz_loadu_epi32(_m_,_v_) _mm256_blendv_epi8(zv, mm256_maskz_expand_epi32(xm, _mm256_loadu_si256((__m256i*)pex)), u2vmask(xm,tv)) - #endif - -//----------------------------------------------------------------------------- -#define VSTO( _op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, ov) -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) -#include "bitunpack256v_.h" - -#define BITUNBLK256V32_0(ip, _i_, _op_, _parm_) {__m256i ov;\ - VSTO0(_op_, 0, ov, _parm_);\ - VSTO0(_op_, 1, ov, _parm_);\ - VSTO0(_op_, 2, ov, _parm_);\ - VSTO0(_op_, 3, ov, _parm_);\ - VSTO0(_op_, 4, ov, _parm_);\ - VSTO0(_op_, 5, ov, _parm_);\ - VSTO0(_op_, 6, ov, _parm_);\ - VSTO0(_op_, 7, ov, _parm_);\ - VSTO0(_op_, 8, ov, _parm_);\ - VSTO0(_op_, 9, ov, _parm_);\ - VSTO0(_op_, 10, ov, _parm_);\ - VSTO0(_op_, 11, ov, _parm_);\ - VSTO0(_op_, 12, ov, _parm_);\ - VSTO0(_op_, 13, ov, _parm_);\ - VSTO0(_op_, 14, ov, _parm_);\ - VSTO0(_op_, 15, ov, _parm_);\ - VSTO0(_op_, 16, ov, _parm_);\ - VSTO0(_op_, 17, ov, _parm_);\ - VSTO0(_op_, 18, ov, _parm_);\ - VSTO0(_op_, 19, ov, _parm_);\ - VSTO0(_op_, 20, ov, _parm_);\ - VSTO0(_op_, 21, ov, _parm_);\ - VSTO0(_op_, 22, ov, _parm_);\ - VSTO0(_op_, 23, ov, _parm_);\ - VSTO0(_op_, 24, ov, _parm_);\ - VSTO0(_op_, 25, ov, _parm_);\ - VSTO0(_op_, 26, ov, _parm_);\ - VSTO0(_op_, 27, ov, _parm_);\ - VSTO0(_op_, 28, ov, _parm_);\ - VSTO0(_op_, 29, ov, _parm_);\ - VSTO0(_op_, 30, ov, _parm_);\ - VSTO0(_op_, 31, ov, _parm_);\ -} -#define BITUNPACK0(_parm_) _parm_ = _mm256_setzero_si256() - -unsigned char *bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv; - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//--------------------------------------- zeromask unpack for TurboPFor vp4d.c 
-------------------------------------- -#define VSTO(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) )); pex += popcnt32(xm) -#define VSTO0(_op_, _i_, _ov_, _parm_) xm = *bb++; _mm256_storeu_si256(_op_++, mm256_maskz_loadu_epi32(xm,(__m256i*)pex) ); pex += popcnt32(xm) -#define BITUNPACK0(_parm_) -#include "bitunpack256v_.h" - -unsigned char *_bitunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; __m256i sv, zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//-------------------------------- -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_) -#define VSTO(__op, i, __ov, __sv) __ov = UNZIGZAG256x32(__ov); SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) -#include "bitunpack256v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *bitzunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCAN256x32(__ov,__sv); _mm256_storeu_si256(__op++, __sv) -#include "bitunpack256v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), zv = _mm256_setzero_si256(); - 
BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VEXP(_i_, _ov_) xm = *bb++; _ov_ = _mm256_add_epi32(_ov_, _mm256_slli_epi32(mm256_maskz_loadu_epi32(xm,(__m256i*)pex), b) ); pex += popcnt32(xm) -#define VEXP0(_i_, _ov_) xm = *bb++; _ov_ = mm256_maskz_loadu_epi32(xm,(__m256i*)pex); pex += popcnt32(xm) - -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCAN256x32(_ov_,_sv_); _mm256_storeu_si256(_op_++, _sv_); - -#include "bitunpack256v_.h" - -#define BITUNPACK0(_parm_) - -unsigned char *_bitdunpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; - __m256i sv = _mm256_set1_epi32(start),zv = _mm256_setzero_si256(), tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 - -//----------------------------------------------------------------------------- -#define VSTO(__op, i, __ov, __sv) SCANI256x32(__ov,__sv,cv); _mm256_storeu_si256(__op++, __sv); -#define VSTO0(_op_, _i_, ov, _parm_) _mm256_storeu_si256(_op_++, _parm_); _parm_ = _mm256_add_epi32(_parm_, cv) -#include "bitunpack256v_.h" - -#define BITUNPACK0(_parm_) _parm_ = _mm256_add_epi32(_parm_, cv); cv = _mm256_set1_epi32(8) - -unsigned char *bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b) { - const unsigned char *ip = in+PAD8(256*b); - __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} 
-#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -//----------------------------------------------------------------------------- -#define VSTO( _op_, _i_, _ov_, _sv_) VEXP( _i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); -#define VSTO0(_op_, _i_, _ov_, _sv_) VEXP0(_i_, _ov_); SCANI256x32(_ov_,_sv_,cv); _mm256_storeu_si256(_op_++, _sv_); - -#include "bitunpack256v_.h" - -#define BITUNPACK0(_parm_) mv = _mm256_set1_epi32(0) //_parm_ = _mm_setzero_si128() - -unsigned char *_bitd1unpack256v32( const unsigned char *__restrict in, unsigned n, unsigned *__restrict out, unsigned start, unsigned b, unsigned *__restrict pex, unsigned char *bb) { - const unsigned char *ip = in+PAD8(256*b); unsigned xm; - __m256i sv = _mm256_set1_epi32(start), cv = _mm256_set_epi32(8,7,6,5,4,3,2,1),zv = _mm256_setzero_si256(),tv = _mm256_set_epi32(0,1,2,3,4,5,6,7); - BITUNPACK256V32(in, b, out, sv); - return (unsigned char *)ip; -} -#undef VSTO -#undef VSTO0 -#undef BITUNPACK0 -#endif