diff --git a/bitunpack_.h b/bitunpack_.h
index e65295d..cebbbe9 100644
--- a/bitunpack_.h
+++ b/bitunpack_.h
@@ -3649,21 +3649,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_1(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 7),mv); VO16(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 10),mv); VO16(op,i*16+10,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 11),mv); VO16(op,i*16+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 12),mv); VO16(op,i*16+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 13),mv); VO16(op,i*16+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 14),mv); VO16(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 7),mv); VO16(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 9,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 10),mv); VO16(op,i*16+10,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 11),mv); VO16(op,i*16+11,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 12),mv); VO16(op,i*16+12,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 13),mv); VO16(op,i*16+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 14),mv); VO16(op,i*16+14,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_1(ip, op, nb,parm) {\
@@ -3672,13 +3672,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_2(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*8+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*8+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*8+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 10),mv); VO16(op,i*8+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 12),mv); VO16(op,i*8+ 6,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); VO16(op,i*8+ 7,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*8+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*8+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 10),mv); VO16(op,i*8+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 12),mv); VO16(op,i*8+ 6,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); VO16(op,i*8+ 7,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_2(ip, op, nb,parm) {\
@@ -3688,21 +3688,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_3(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 12),mv); VO16(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 11),mv); VO16(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+10,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 7),mv); VO16(op,i*16+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 10),mv); VO16(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 12),mv); VO16(op,i*16+ 4,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 11),mv); VO16(op,i*16+ 9,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+10,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+11,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 7),mv); VO16(op,i*16+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 10),mv); VO16(op,i*16+14,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 13); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_3(ip, op, nb,parm) {\
@@ -3711,9 +3711,9 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_4(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*4+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*4+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*4+ 2,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); VO16(op,i*4+ 3,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*4+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*4+ 2,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); VO16(op,i*4+ 3,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_4(ip, op, nb,parm) {\
@@ -3725,21 +3725,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_5(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 10),mv); VO16(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 5,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+10,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 7),mv); VO16(op,i*16+11,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 11); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 10),mv); VO16(op,i*16+ 2,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 9),mv); VO16(op,i*16+ 5,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+10,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 7),mv); VO16(op,i*16+11,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*16+14,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 11); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_5(ip, op, nb,parm) {\
@@ -3748,13 +3748,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_6(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*8+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*8+ 1,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 2,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*8+ 4,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 5,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 6,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 10); VO16(op,i*8+ 7,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*8+ 1,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 2,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*8+ 4,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 5,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 6,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 10); VO16(op,i*8+ 7,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_6(ip, op, nb,parm) {\
@@ -3764,21 +3764,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_7(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 7),mv); VO16(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 5,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*16+10,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+11,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+13,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 9); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi16(iv, 7),mv); VO16(op,i*16+ 1,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 5),mv); VO16(op,i*16+ 3,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 5,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi16(iv, 8),mv); VO16(op,i*16+ 8,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*16+10,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+11,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+13,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+14,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 9); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_7(ip, op, nb,parm) {\
@@ -3787,7 +3787,7 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_8(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*2+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 8); VO16(op,i*2+ 1,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 8); VO16(op,i*2+ 1,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_8(ip, op, nb,parm) {\
@@ -3803,21 +3803,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_9(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+10,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+11,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 5),mv); VO16(op,i*16+13,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+14,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 7); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 2,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+ 4,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 6),mv); VO16(op,i*16+ 6,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 9,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+10,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+11,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 5),mv); VO16(op,i*16+13,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+14,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 7); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_9(ip, op, nb,parm) {\
@@ -3826,13 +3826,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_10(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*8+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*8+ 1,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 2,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 3,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*8+ 4,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 5,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 6,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 6); VO16(op,i*8+ 7,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*8+ 1,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*8+ 2,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 3,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*8+ 4,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*8+ 5,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 6,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 6); VO16(op,i*8+ 7,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_10(ip, op, nb,parm) {\
@@ -3842,21 +3842,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_11(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+10,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+11,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+13,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+14,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 5); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 3,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+ 6,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 3),mv); VO16(op,i*16+ 9,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+10,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+11,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 4),mv); VO16(op,i*16+12,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+13,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+14,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 5); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_11(ip, op, nb,parm) {\
@@ -3866,9 +3866,9 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_12(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*4+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*4+ 1,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*4+ 2,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 4); VO16(op,i*4+ 3,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*4+ 1,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*4+ 2,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 4); VO16(op,i*4+ 3,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_12(ip, op, nb,parm) {\
@@ -3880,21 +3880,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_13(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 12), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 5,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 11), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi16(iv, 2),mv); VO16(op,i*16+10,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+11,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+13,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+14,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 3); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 12), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 1),mv); VO16(op,i*16+ 5,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 11), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi16(iv, 2),mv); VO16(op,i*16+10,ov,nb,parm); \
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+11,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+12,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+13,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+14,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 3); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_13(ip, op, nb,parm) {\
@@ -3903,13 +3903,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_14(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*8+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 1,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 2,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*8+ 3,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*8+ 4,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 10), mv)); VO16(op,i*8+ 5,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 12), mv)); VO16(op,i*8+ 6,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 2); VO16(op,i*8+ 7,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*8+ 1,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*8+ 2,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*8+ 3,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*8+ 4,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 10), mv)); VO16(op,i*8+ 5,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 12), mv)); VO16(op,i*8+ 6,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 2); VO16(op,i*8+ 7,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_14(ip, op, nb,parm) {\
@@ -3919,21 +3919,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V16_15(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO16(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+10,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 11), mv)); VO16(op,i*16+11,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 12), mv)); VO16(op,i*16+12,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 13), mv)); VO16(op,i*16+13,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi16(iv, 14), mv)); VO16(op,i*16+14,ov,nb,parm);\
-ov = _mm_srli_epi16(iv, 1); VO16(op,i*16+15,ov,nb,parm); ;\
+ov = mm_srli_epi16(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 1), mv)); VO16(op,i*16+ 1,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 2), mv)); VO16(op,i*16+ 2,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 3), mv)); VO16(op,i*16+ 3,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 4), mv)); VO16(op,i*16+ 4,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 5), mv)); VO16(op,i*16+ 5,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 6), mv)); VO16(op,i*16+ 6,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 7), mv)); VO16(op,i*16+ 7,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 8), mv)); VO16(op,i*16+ 8,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 9), mv)); VO16(op,i*16+ 9,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 10), mv)); VO16(op,i*16+10,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 11), mv)); VO16(op,i*16+11,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 12), mv)); VO16(op,i*16+12,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 13), mv)); VO16(op,i*16+13,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi16(iv, 14), mv)); VO16(op,i*16+14,ov,nb,parm);\
+ov = mm_srli_epi16(iv, 1); VO16(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V16_15(ip, op, nb,parm) {\
@@ -3969,37 +3969,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_1(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 9,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+10,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+14,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+15,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+17,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+18,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+19,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+20,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VO32(op,i*32+21,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*32+22,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VO32(op,i*32+23,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*32+24,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VO32(op,i*32+25,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VO32(op,i*32+26,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VO32(op,i*32+27,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VO32(op,i*32+28,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 29),mv); VO32(op,i*32+29,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 30),mv); VO32(op,i*32+30,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 31); VO32(op,i*32+31,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 9,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+10,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+11,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+14,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+15,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+17,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+18,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+19,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+20,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 21),mv); VO32(op,i*32+21,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*32+22,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 23),mv); VO32(op,i*32+23,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*32+24,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 25),mv); VO32(op,i*32+25,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 26),mv); VO32(op,i*32+26,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 27),mv); VO32(op,i*32+27,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 28),mv); VO32(op,i*32+28,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 29),mv); VO32(op,i*32+29,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 30),mv); VO32(op,i*32+30,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 31); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V32_1(ip, op, nb,parm) {\
@@ -4008,21 +4008,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_2(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*16+10,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*16+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*16+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VO32(op,i*16+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VO32(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 30); VO32(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 9,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*16+10,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*16+11,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*16+12,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 26),mv); VO32(op,i*16+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 28),mv); VO32(op,i*16+14,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 30); VO32(op,i*16+15,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V32_2(ip, op, nb,parm) {\
@@ -4032,37 +4032,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_3(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VO32(op,i*32+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*32+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 27),mv); VO32(op,i*32+ 9,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+10,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+14,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+15,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+17,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*32+18,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VO32(op,i*32+19,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 28),mv); VO32(op,i*32+20,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+21,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+22,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+23,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+25,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+26,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+27,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+28,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VO32(op,i*32+29,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VO32(op,i*32+30,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 29); VO32(op,i*32+31,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 21),mv); VO32(op,i*32+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*32+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 27),mv); VO32(op,i*32+ 9,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+10,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+11,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+14,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+15,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+17,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*32+18,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 25),mv); VO32(op,i*32+19,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 28),mv); VO32(op,i*32+20,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+21,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+22,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+23,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+25,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+26,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+27,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+28,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 23),mv); VO32(op,i*32+29,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 26),mv); VO32(op,i*32+30,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 29); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V32_3(ip, op, nb,parm) {\
@@ -4071,13 +4071,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_4(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*8+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*8+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*8+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*8+ 5,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*8+ 6,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 28); VO32(op,i*8+ 7,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*8+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*8+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*8+ 5,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*8+ 6,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 28); VO32(op,i*8+ 7,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V32_4(ip, op, nb,parm) {\
@@ -4089,37 +4089,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_5(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+ 4,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 25),mv); VO32(op,i*32+ 5,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 9,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+10,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VO32(op,i*32+11,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+14,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+15,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VO32(op,i*32+17,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 26),mv); VO32(op,i*32+18,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+19,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+21,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+22,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+23,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*32+24,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+25,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+26,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+27,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+28,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+29,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*32+30,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 27); VO32(op,i*32+31,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+ 4,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 25),mv); VO32(op,i*32+ 5,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 9,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+10,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 23),mv); VO32(op,i*32+11,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+13,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+14,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+15,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 21),mv); VO32(op,i*32+17,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 26),mv); VO32(op,i*32+18,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+19,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+21,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+22,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+23,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*32+24,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+25,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+26,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+27,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+28,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+29,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*32+30,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 27); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 
 #define BITUNPACK128V32_5(ip, op, nb,parm) {\
@@ -4128,21 +4128,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 
 #define BITUNBLK128V32_6(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 1,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 2,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 3,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*16+ 4,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 6,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 7,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*16+ 9,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+10,ov,nb,parm);\
-ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+11,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*16+13,ov,nb,parm); \
-ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*16+14,ov,nb,parm); \
-ov = _mm_srli_epi32(iv, 26); VO32(op,i*16+15,ov,nb,parm); ;\
+ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 1,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 2,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 3,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*16+ 4,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
+ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 6,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 7,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \
+ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*16+ 9,ov,nb,parm); \
+ov = mm_srli_epi32(iv, 28); 
iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+10,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+11,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*16+13,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*16+14,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); VO32(op,i*16+15,ov,nb,parm); ;\ } #define BITUNPACK128V32_6(ip, op, nb,parm) {\ @@ -4152,37 +4152,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_7(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 1,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+ 2,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VO32(op,i*32+ 3,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 5,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 6,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+ 7,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 24),mv); VO32(op,i*32+ 8,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+10,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+11,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+12,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+13,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+14,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+15,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 23),mv); VO32(op,i*32+17,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+18,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+19,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+20,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+21,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+22,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+23,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+25,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*32+26,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+27,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); 
VO32(op,i*32+28,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+29,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+30,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 1,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+ 2,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 21),mv); VO32(op,i*32+ 3,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 5,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 6,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+ 7,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 24),mv); VO32(op,i*32+ 8,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+10,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+11,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+12,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+13,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+14,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+15,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 23),mv); VO32(op,i*32+17,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+18,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+19,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+20,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+21,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+22,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+23,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+25,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*32+26,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+27,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+28,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+29,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+30,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_7(ip, op, nb,parm) {\ @@ -4191,9 +4191,9 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_8(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*4+ 0,ov,nb,parm); \ - ov = 
_mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*4+ 1,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*4+ 2,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); VO32(op,i*4+ 3,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*4+ 1,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*4+ 2,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); VO32(op,i*4+ 3,ov,nb,parm); ;\ } #define BITUNPACK128V32_8(ip, op, nb,parm) {\ @@ -4209,37 +4209,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_9(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 1,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+ 2,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 5,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 22),mv); VO32(op,i*32+ 6,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+ 9,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+10,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+11,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 21),mv); VO32(op,i*32+13,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+14,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+15,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+17,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+18,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+19,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+20,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+21,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+22,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+23,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+25,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+26,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+27,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+29,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+30,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 23); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 1,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+ 2,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 5,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 22),mv); VO32(op,i*32+ 6,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+ 9,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+10,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+11,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 21),mv); VO32(op,i*32+13,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+14,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+15,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+18,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+19,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+20,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+21,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+22,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+23,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+25,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+26,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+27,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+29,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+30,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 23); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_9(ip, op, nb,parm) {\ @@ -4248,21 +4248,21 @@ unsigned char 
*TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_10(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 1,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*16+ 2,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 3,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 5,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 7,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+10,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*16+11,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+13,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*16+14,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); VO32(op,i*16+15,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 1,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*16+ 2,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 3,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*16+ 5,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 7,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+10,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*16+11,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+13,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*16+14,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); VO32(op,i*16+15,ov,nb,parm); ;\ } #define BITUNPACK128V32_10(ip, op, nb,parm) {\ @@ -4272,37 +4272,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_11(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 
0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 1,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 3,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 6,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 7,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 9,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+10,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+11,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+13,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+14,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+15,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+17,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+18,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+19,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+21,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+22,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+23,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 19),mv); VO32(op,i*32+25,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+26,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+27,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 20),mv); VO32(op,i*32+28,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+29,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+30,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 21); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 1,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 
2,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 3,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 6,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 7,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+ 9,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+10,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+11,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+13,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+14,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+15,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+18,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+19,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+21,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+22,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+23,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 19),mv); VO32(op,i*32+25,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+26,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+27,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 20),mv); VO32(op,i*32+28,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+29,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+30,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 21); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_11(ip, op, nb,parm) {\ @@ -4311,13 +4311,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_12(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*8+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*8+ 1,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 2,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 3,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*8+ 4,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 5,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 6,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 20); VO32(op,i*8+ 7,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*8+ 1,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 2,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 3,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*8+ 4,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 5,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 6,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 20); VO32(op,i*8+ 7,ov,nb,parm); ;\ } #define BITUNPACK128V32_12(ip, op, nb,parm) {\ @@ -4329,37 +4329,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_13(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 1,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 3,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 5,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+ 6,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+10,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+11,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+13,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+14,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+15,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+17,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); 
VO32(op,i*32+18,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+19,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 17),mv); VO32(op,i*32+21,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+22,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+23,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+25,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 18),mv); VO32(op,i*32+26,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+27,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+28,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+29,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+30,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 19); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 1,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 3,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 5,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+ 6,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+10,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+11,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+13,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+14,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+15,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+18,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 23); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+19,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 17),mv); VO32(op,i*32+21,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+22,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+23,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+25,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 18),mv); VO32(op,i*32+26,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+27,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+28,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+29,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+30,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 19); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_13(ip, op, nb,parm) {\ @@ -4368,21 +4368,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_14(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*16+ 1,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 2,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 3,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 5,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 7,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*16+10,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+11,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+13,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+14,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 18); VO32(op,i*16+15,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*16+ 1,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 2,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*16+ 3,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 5,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 7,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*16+ 8,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*16+10,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+11,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+13,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+14,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 18); VO32(op,i*16+15,ov,nb,parm); ;\ } #define BITUNPACK128V32_14(ip, op, nb,parm) {\ @@ -4392,37 +4392,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_15(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 1,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 3,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 5,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 6,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 7,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 9,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+10,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+11,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+13,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); 
VO32(op,i*32+14,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+15,ov,nb,parm); \ - ov = _mm_and_si128(_mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+17,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+18,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+19,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+20,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+21,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+22,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+23,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+25,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+26,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+27,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+28,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+29,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+30,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 17); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = _mm_and_si128( mm_srli_epi32(iv, 15),mv); VO32(op,i*32+ 1,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+ 3,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 5,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 6,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 7,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+ 9,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+10,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+11,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+13,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 18); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+14,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+15,ov,nb,parm); \ + ov = _mm_and_si128( mm_srli_epi32(iv, 16),mv); VO32(op,i*32+16,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+18,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+19,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+20,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+21,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+22,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+23,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+25,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+26,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+27,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+28,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+29,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+30,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 17); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_15(ip, op, nb,parm) {\ @@ -4431,7 +4431,7 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_16(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*2+ 0,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 16); VO32(op,i*2+ 1,ov,nb,parm); ;\ + ov = mm_srli_epi32(iv, 16); VO32(op,i*2+ 1,ov,nb,parm); ;\ } #define BITUNPACK128V32_16(ip, op, nb,parm) {\ @@ -4455,37 +4455,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_17(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 1,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 2,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 6,ov,nb,parm); \ - ov = 
_mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+10,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+11,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+13,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 14),mv); VO32(op,i*32+14,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+15,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+17,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+18,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+19,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+20,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+21,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+22,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+23,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+25,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+26,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+27,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\ - ov = _mm_and_si128(_mm_srli_epi32(iv, 13),mv); VO32(op,i*32+29,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+30,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 15); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 1,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 2,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 
4,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 6,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+10,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+11,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+12,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+13,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 14),mv); VO32(op,i*32+14,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+15,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+17,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+18,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+19,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+20,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+21,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+22,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+23,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+25,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+26,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+27,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 13),mv); VO32(op,i*32+29,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+30,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 15); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_17(ip, op, nb,parm) {\ @@ -4494,21 +4494,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_18(ip, i, op, nb,parm) { __m128i ov,iv = 
_mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 2,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 3,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 6,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 9,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+10,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+11,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*16+13,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); VO32(op,i*16+15,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 2,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 3,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+ 4,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*16+ 6,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 9,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+10,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+11,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*16+13,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); VO32(op,i*16+15,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_18(ip, op, nb,parm) {\
@@ -4518,37 +4518,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_19(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 2,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 7,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 9,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+10,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+11,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+13,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+14,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+17,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+18,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+19,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+21,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+22,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+23,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+25,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+26,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+27,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+28,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+29,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+ 2,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 12),mv); VO32(op,i*32+ 4,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 7,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 11),mv); VO32(op,i*32+ 9,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+10,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+11,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+13,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+14,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+17,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+18,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+19,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+21,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+22,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+23,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+25,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+26,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+27,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+28,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+29,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+30,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 13); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_19(ip, op, nb,parm) {\
@@ -4557,13 +4557,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_20(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*8+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*8+ 1,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 2,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*8+ 4,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 5,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); VO32(op,i*8+ 7,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*8+ 1,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*8+ 2,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*8+ 4,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*8+ 5,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); VO32(op,i*8+ 7,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_20(ip, op, nb,parm) {\
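The hunks in this patch all make the same mechanical substitution: the raw _mm_srli_epi32/_mm_slli_epi32 intrinsics inside each BITUNBLK128V32_<nb> body become mm_srli_epi32/mm_slli_epi32, which I take to be project-local wrapper macros defined elsewhere in the file (an assumption; only the call sites are visible here). The underlying schedule is untouched: an nb-bit value wholly inside the current 32-bit lane word is extracted with a right shift plus the nb-bit mask mv, while a value straddling a word boundary takes its low bits from a right shift of the current word and its high bits from a left shift of the next word, OR'd together. As a hedged illustration only (not part of the patch, and with a hypothetical function name), here is the 20-bit schedule of BITUNBLK128V32_20 just above written out as scalar C:

/* Scalar sketch of the 20-bit unpack schedule: five 32-bit words carry
   eight 20-bit values; mv is the 20-bit mask, and each straddling value
   ORs in the high bits shifted up from the next word, exactly as the
   srli/slli pairs in the macro do per SSE lane. */
#include <stdint.h>

static void unpack20_scalar(const uint32_t in[5], uint32_t out[8]) {
  const uint32_t mv = (1u << 20) - 1;              /* like mv in the macros   */
  out[0] =  in[0]        & mv;                     /* w0 bits  0..19          */
  out[1] = (in[0] >> 20) | ((in[1] << 12) & mv);   /* w0 12 bits + w1 8 bits  */
  out[2] = (in[1] >>  8) & mv;                     /* w1 bits  8..27          */
  out[3] = (in[1] >> 28) | ((in[2] <<  4) & mv);   /* w1 4 bits + w2 16 bits  */
  out[4] = (in[2] >> 16) | ((in[3] << 16) & mv);   /* w2 16 bits + w3 4 bits  */
  out[5] = (in[3] >>  4) & mv;                     /* w3 bits  4..23          */
  out[6] = (in[3] >> 24) | ((in[4] <<  8) & mv);   /* w3 8 bits + w4 12 bits  */
  out[7] =  in[4] >> 12;                           /* w4 bits 12..31          */
}

The SIMD macros run this same schedule on four lanes at once and route each result through VO32, which presumably applies the per-output post-processing selected by nb and parm.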
@@ -4575,37 +4575,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_21(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 2,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 5,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+10,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+11,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+13,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+14,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+17,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+18,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+19,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+21,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+22,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+23,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+25,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+26,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+27,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+28,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+29,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 11); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 10),mv); VO32(op,i*32+ 2,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 9),mv); VO32(op,i*32+ 5,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+ 8,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+10,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+11,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+13,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+14,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+17,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+18,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+19,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+20,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+21,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+22,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+23,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+25,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+26,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+27,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+28,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+29,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+30,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 11); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_21(ip, op, nb,parm) {\
@@ -4614,21 +4614,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_22(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+ 2,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 3,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 6,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 9,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+10,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+11,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+13,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); VO32(op,i*16+15,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+ 2,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 3,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 5,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+ 6,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*16+ 9,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+10,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+11,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*16+12,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+13,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 10); VO32(op,i*16+15,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_22(ip, op, nb,parm) {\
@@ -4638,37 +4638,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_23(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 3,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 7,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+10,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+11,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+13,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+14,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 7),mv); VO32(op,i*32+17,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+18,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+19,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+20,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+21,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+22,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+23,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+25,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+26,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+27,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+28,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+29,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 9); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+ 3,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 7,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+10,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+11,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+13,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+14,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 7),mv); VO32(op,i*32+17,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+18,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+19,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+20,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+21,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+22,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+23,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 8),mv); VO32(op,i*32+24,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+25,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+26,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+27,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+28,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+29,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+30,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 9); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_23(ip, op, nb,parm) {\
@@ -4677,9 +4677,9 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_24(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*4+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*4+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*4+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); VO32(op,i*4+ 3,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*4+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*4+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); VO32(op,i*4+ 3,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_24(ip, op, nb,parm) {\
@@ -4695,37 +4695,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_25(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 9,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+10,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+11,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+12,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 5),mv); VO32(op,i*32+13,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+17,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+18,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+19,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+20,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+21,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 6),mv); VO32(op,i*32+22,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+23,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+25,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+26,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+27,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+29,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 7); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+ 4,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+ 9,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+10,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+11,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+12,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 5),mv); VO32(op,i*32+13,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+17,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+18,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+19,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+20,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+21,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 6),mv); VO32(op,i*32+22,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+23,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+25,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+26,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+27,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+28,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+29,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+30,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 7); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_25(ip, op, nb,parm) {\
@@ -4734,21 +4734,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_26(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*16+ 4,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 5,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*16+ 9,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*16+10,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+11,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+13,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 6); VO32(op,i*16+15,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*16+ 4,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*16+ 5,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*16+ 9,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*16+10,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+11,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+12,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+13,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 6); VO32(op,i*16+15,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_26(ip, op, nb,parm) {\
@@ -4758,37 +4758,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_27(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 6,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+10,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+11,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+13,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+17,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+18,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+19,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+21,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+22,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+23,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+24,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 3),mv); VO32(op,i*32+25,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+26,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+27,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+28,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+29,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 5); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+ 6,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+10,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+11,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 4),mv); VO32(op,i*32+12,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+13,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+17,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+18,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+19,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+20,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+21,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+22,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+23,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+24,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 3),mv); VO32(op,i*32+25,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+26,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+27,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+28,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+29,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+30,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 5); VO32(op,i*32+31,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_27(ip, op, nb,parm) {\
@@ -4797,13 +4797,13 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_28(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*8+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*8+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*8+ 4,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*8+ 5,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*8+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 4); VO32(op,i*8+ 7,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*8+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*8+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*8+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*8+ 4,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*8+ 5,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*8+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 4); VO32(op,i*8+ 7,ov,nb,parm); ;\
 }
 #define BITUNPACK128V32_28(ip, op, nb,parm) {\
@@ -4815,37 +4815,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un
 #define BITUNBLK128V32_29(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\
 ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 2),mv); VO32(op,i*32+10,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+11,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+13,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+14,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+15,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+17,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+18,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+19,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VO32(op,i*32+20,ov,nb,parm);\
- ov = _mm_and_si128(_mm_srli_epi32(iv, 1),mv); VO32(op,i*32+21,ov,nb,parm); \
- ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+22,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+23,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+25,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+26,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+27,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+28,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+29,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+30,ov,nb,parm);\
- ov = _mm_srli_epi32(iv, 3); VO32(op,i*32+31,ov,nb,parm); ;\
+ ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 1,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 2,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 3,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+ 4,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+ 5,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+ 6,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+ 7,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+ 8,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 27), mv)); VO32(op,i*32+ 9,ov,nb,parm);\
+ ov = _mm_and_si128( mm_srli_epi32(iv, 2),mv); VO32(op,i*32+10,ov,nb,parm); \
+ ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+11,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+12,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+13,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+14,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+15,ov,nb,parm);\
+ ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov =
_mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+18,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+19,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 28), mv)); VO32(op,i*32+20,ov,nb,parm);\ + ov = _mm_and_si128( mm_srli_epi32(iv, 1),mv); VO32(op,i*32+21,ov,nb,parm); \ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+22,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+23,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+24,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+25,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+26,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+27,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+28,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+29,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+30,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 3); VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_29(ip, op, nb,parm) {\ @@ -4854,21 +4854,21 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_30(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*16+ 0,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 1,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 2,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 3,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 5,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = 
_mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+ 7,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+10,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*16+11,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*16+12,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VO32(op,i*16+13,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VO32(op,i*16+14,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 2); VO32(op,i*16+15,ov,nb,parm); ;\ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*16+ 1,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*16+ 2,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*16+ 3,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*16+ 4,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*16+ 5,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*16+ 6,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*16+ 7,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*16+ 8,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*16+ 9,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*16+10,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*16+11,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*16+12,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i 
*)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 26), mv)); VO32(op,i*16+13,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 28), mv)); VO32(op,i*16+14,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 2); VO32(op,i*16+15,ov,nb,parm); ;\ } #define BITUNPACK128V32_30(ip, op, nb,parm) {\ @@ -4878,37 +4878,37 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un #define BITUNBLK128V32_31(ip, i, op, nb,parm) { __m128i ov,iv = _mm_loadu_si128((__m128i *)ip++);\ ov = _mm_and_si128( iv ,mv); VO32(op,i*32+ 0,ov,nb,parm); \ - ov = _mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 1,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 6,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+10,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+11,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+13,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+14,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+15,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+17,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, 
_mm_and_si128(_mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+18,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+19,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+20,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+21,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+22,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+23,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+24,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+25,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+26,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 27), mv)); VO32(op,i*32+27,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 28), mv)); VO32(op,i*32+28,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 29), mv)); VO32(op,i*32+29,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128(_mm_slli_epi32(iv, 30), mv)); VO32(op,i*32+30,ov,nb,parm);\ - ov = _mm_srli_epi32(iv, 1); VO32(op,i*32+31,ov,nb,parm); ;\ + ov = mm_srli_epi32(iv, 31); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 1), mv)); VO32(op,i*32+ 1,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 30); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 2), mv)); VO32(op,i*32+ 2,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 29); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 3), mv)); VO32(op,i*32+ 3,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 28); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 4), mv)); VO32(op,i*32+ 4,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 27); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 5), mv)); VO32(op,i*32+ 5,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 26); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 6), mv)); VO32(op,i*32+ 6,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 25); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 7), mv)); VO32(op,i*32+ 7,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 24); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 8), mv)); VO32(op,i*32+ 8,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 23); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( 
mm_slli_epi32(iv, 9), mv)); VO32(op,i*32+ 9,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 22); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 10), mv)); VO32(op,i*32+10,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 21); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 11), mv)); VO32(op,i*32+11,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 20); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 12), mv)); VO32(op,i*32+12,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 19); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 13), mv)); VO32(op,i*32+13,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 18); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 14), mv)); VO32(op,i*32+14,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 17); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 15), mv)); VO32(op,i*32+15,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 16); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 16), mv)); VO32(op,i*32+16,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 15); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 17), mv)); VO32(op,i*32+17,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 14); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 18), mv)); VO32(op,i*32+18,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 13); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 19), mv)); VO32(op,i*32+19,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 12); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 20), mv)); VO32(op,i*32+20,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 11); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 21), mv)); VO32(op,i*32+21,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 10); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 22), mv)); VO32(op,i*32+22,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 9); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 23), mv)); VO32(op,i*32+23,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 8); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 24), mv)); VO32(op,i*32+24,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 7); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 25), mv)); VO32(op,i*32+25,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 6); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 26), mv)); VO32(op,i*32+26,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 5); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 27), mv)); VO32(op,i*32+27,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 4); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 28), mv)); VO32(op,i*32+28,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 3); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 29), mv)); VO32(op,i*32+29,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 2); iv = _mm_loadu_si128((__m128i *)ip++); ov = _mm_or_si128(ov, _mm_and_si128( mm_slli_epi32(iv, 30), mv)); VO32(op,i*32+30,ov,nb,parm);\ + ov = mm_srli_epi32(iv, 1); 
VO32(op,i*32+31,ov,nb,parm); ;\ } #define BITUNPACK128V32_31(ip, op, nb,parm) {\ @@ -5964,7 +5964,7 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un case 13: mv = _mm_set1_epi16((1u<<13)-1); BITUNPACK128V16_13(_iv, _ov, 13,_parm_); break;\ case 14: mv = _mm_set1_epi16((1u<<14)-1); BITUNPACK128V16_14(_iv, _ov, 14,_parm_); break;\ case 15: mv = _mm_set1_epi16((1u<<15)-1); BITUNPACK128V16_15(_iv, _ov, 15,_parm_); break;\ - case 16: mv = _mm_set1_epi16((1u<<16)-1); BITUNPACK128V16_16(_iv, _ov, 16,_parm_); break;\ + case 16: mv = _mm_set1_epi16((1u<<16)-1); BITUNPACK128V16_16(_iv, _ov, BITMAX16,_parm_); break;\ /*defaultcase 17 ... 63: break;*/\ }\ } @@ -6003,7 +6003,7 @@ unsigned char *TEMPLATE2(_BITUNPACK_,64)( const unsigned char *__restrict in, un case 29: mv = _mm_set1_epi32((1u<<29)-1); BITUNPACK128V32_29(_iv, _ov, 29,_parm_); break;\ case 30: mv = _mm_set1_epi32((1u<<30)-1); BITUNPACK128V32_30(_iv, _ov, 30,_parm_); break;\ case 31: mv = _mm_set1_epi32((1u<<31)-1); BITUNPACK128V32_31(_iv, _ov, 31,_parm_); break;\ - case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, 32,_parm_); break;\ + case 32: mv = _mm_set1_epi32((1ull<<32)-1);BITUNPACK128V32_32(_iv, _ov, BITMAX32,_parm_); break;\ /*defaultcase 33 ... 63: break;*/\ }\ }
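
Editor's note (not part of the patch): each BITUNBLK128V32_nb macro in the hunks above is one fully unrolled bit-unpacker. It keeps a running input vector iv, extracts each nb-bit value with a right shift plus mask against mv, and whenever a value straddles a 32-bit lane boundary it loads the next vector and ORs the missing high bits in with a left shift; the diff only swaps the raw _mm_srli/_mm_slli intrinsics for the project's mm_srli/mm_slli wrappers without changing that scheme. The scalar sketch below shows the same carry pattern for nb = 28 on a plain 32-bit word stream; unpack28 is a hypothetical helper written for illustration, not a function of this library.

/* Scalar analogue of the BITUNBLK128V32_28 carry scheme (illustrative sketch).
 * Decodes n 28-bit values from a packed little-endian uint32_t stream. */
#include <stdint.h>
#include <stddef.h>

static void unpack28(const uint32_t *in, uint32_t *out, size_t n) {
    const uint32_t mv = (1u << 28) - 1;    /* mask, cf. mv = _mm_set1_epi32((1u<<28)-1)  */
    uint32_t iv   = 0;
    unsigned used = 32;                    /* bits already consumed from iv              */
    for (size_t i = 0; i < n; i++) {
        uint32_t ov;
        if (used == 32) { iv = *in++; used = 0; }   /* cf. iv = _mm_loadu_si128(...)     */
        if (used + 28 <= 32) {             /* value fits entirely in the current word    */
            ov = (iv >> used) & mv;        /* cf. srli + and against mv                  */
            used += 28;
        } else {                           /* value straddles the word boundary:         */
            unsigned lo = 32 - used;       /* take the low part from the current word,   */
            ov  = iv >> used;              /* cf. ov = mm_srli_epi32(iv, used)           */
            iv  = *in++;                   /* load the next word,                        */
            ov |= (iv << lo) & mv;         /* cf. or with mm_slli_epi32(iv, lo) & mv     */
            used = 28 - lo;
        }
        out[i] = ov;
    }
}

The unrolled macros hard-code exactly these shift pairs (srli 28/slli 4, srli 24/slli 8, and so on), so no per-element branch or bit counter survives in the SIMD code; that is the point of generating one macro per bit width and dispatching on nb in the switch at the end of the file.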